Skip to content

[Feature Request]: Type Checking & Error Reporting #344

Open
@yzb2

Description

@yzb2

Feature Name

Type Checking & Error Reporting

Feature Description

Type checking the annotations dataframe passed to run would have saved me a lot of time today:

# Create the fastdup object for my_path, then run it with the annotations dataframe df.
fd = fastdup.create(input_dir=my_path)

fd.run(
    annotations=df,
    verbose=True,
)

Turns out, some of the values in my col_x had two decimal points in them, so they could not be parsed as numbers.

Another thing that would have saved me a lot of time is actual error reporting, instead of catching the error and reporting something arbitrary. The above, even with verbose set, simply returned:

Reading regular file. Please make sure the command works in terminal.

If this error message was even in reference to my mistake (which I don't believe it is), what command am I ensuring works in the terminal? I don't know... The console doesn't know... Does anyone know?

This is a rather unacceptable and ambiguous error, as to bypass it I had to copy and paste your source code then delete the part that silently caught the error:

import json
import os
import tempfile
import warnings
from typing import List, Union, Tuple
import numpy as np
import pandas as pd
from pathlib import Path
import fastdup
from pandas.errors import EmptyDataError
import shutil
import fastdup.definitions as FD
#import boto3
from fastdup.sentry import v1_sentry_handler, fastdup_capture_exception, fastdup_capture_log_debug_state
from fastdup.definitions import FOLDER_FULL_IMAGE_RUN
import pathlib
import re

import fastdup

# Module-level fastdup object that the local run() below reads and updates
# (my_path is not defined in this snippet).
fd = fastdup.create(input_dir=my_path)

def set_fastdup_kwargs(input_kwargs: dict) -> dict:
    """
    Override the default fastdup arguments with the user's input.

    Keys listed in ``fastdup_params`` are passed through unchanged. Keys listed in
    ``turi_params`` are translated through their optional value map and serialized
    into a single comma separated ``name=value`` string stored under ``'turi_param'``;
    options the user did not supply are emitted with their default value.

    :param input_kwargs: user supplied keyword arguments
    :return: dict holding the passed-through fastdup params plus the ``'turi_param'`` string
    :raises ValueError: if a key is neither a fastdup param nor a turi param
    :raises KeyError: if the value given for a mapped turi param is not one of its allowed values
    """
    fastdup_params = {
        'input_dir', 'work_dir', 'test_dir', 'compute', 'verbose', 'num_threads', 'num_images', 'distance',
        'threshold', 'lower_threshold', 'model_path', 'license', 'version', 'nearest_neighbors_k', 'd', 'run_mode',
        'nn_provider', 'min_offset', 'max_offset', 'nnf_mode', 'nnf_param', 'bounding_box', 'batch_size', 'resume',
        'high_accuracy'
    }
    # 'map' translates the user-facing value into the value written to the turi_param string
    # (None means the value is written as-is); 'default' is used when the user gives no value.
    turi_params = {
        'nnmodel': {'map': {'brute_force': 0, 'ball_tree': 1, 'lsh': 2}, 'default': 'brute_force'},
        'ccthreshold': {'map': None, 'default': 0.96},
        'run_cc': {'map': {True: 1, False: 0}, 'default': True},
        'run_sentry': {'map': {True: 1, False: 0}, 'default': True},
        'delete_tar': {'map': {True: 1, False: 0}, 'default': False},
        'delete_img': {'map': {True: 1, False: 0}, 'default': False},
        'tar_only': {'map': {True: 1, False: 0}, 'default': False},
        'run_stats': {'map': {True: 1, False: 0}, 'default': True},
        'run_stats_only': {'map': {True: 1, False: 0}, 'default': False},
        'run_advanced_stats': {'map': {True: 1, False: 0}, 'default': False},
        'sync_s3_to_local': {'map': {True: 1, False: 0}, 'default': False},
        'store_int': {'map': {True: 1, False: 0}, 'default': True},
        'shorten_filenames': {'map': {True: 1, False: 0}, 'default': False},
        'save_crops': {'map': {True: 1, False: 0}, 'default': False},
        'augmentation_horiz': {'map': None, 'default': 0.2},
        'augmentation_vert': {'map': None, 'default': 0.2},
        'augmentation_additive_margin': {'map': None, 'default': 0},
        'num_onnx_inter_threads': {'map': None, 'default': 0},
        'num_onnx_intra_threads': {'map': None, 'default': 0},
        'is_clip14_model':  {'map': {True: 1, False: 0}, 'default': False},
        #'run_labels': {'map': {True: 1, False: 0}, 'default': True},
        #'run_read_filenames': {'map': {True: 1, False: 0}, 'default': True},
        #'min_file_size': {'map': None, 'default': 0},
        #'read_features_parallel': {'map': None, 'default': 0},
        #'test_correction_offset': {'map': None, 'default': 0},
        #'max_augmentations': {'map': None, 'default': 1},
        #'augmentation_type': {'map': None, 'default': 0},
        #'is_ultraface_model': {'map': {True: 1, False: 0}, 'default': False},
        #'is_yolo_model': {'map': {True: 1, False: 0}, 'default': False},
        'min_input_image_height': {'map': None, 'default': 10},
        'min_input_image_width': {'map': None, 'default': 10},
        'save_thumbnails': {'map': {True: 1, False: 0}, 'default': False},
        'find_regex': {'map': None, 'default': ""},
        'no_sort': {'map': {True: 1, False: 0}, 'default': False},
        'quiet': {'map': {True: 1, False: 0}, 'default': False},
        'fastdup_ocr_lang': {'map': None, 'default': "en"},
        'fastdup_ocr_no_crop': {'map': {True: 1, False: 0}, 'default': False}
    }

    # Reject unknown argument names up front, before building anything.
    for key in input_kwargs:
        if key not in fastdup_params and key not in turi_params:
            raise ValueError(f'invalid argument {key}, allowed fastdup params are {fastdup_params}, allowed turi_param values are {turi_params}')

    turi_kwargs = []
    for arg_name, param in turi_params.items():
        value = input_kwargs.get(arg_name, param['default'])
        value_map = param['map']
        if value_map is not None:
            try:
                value = value_map[value]
            except KeyError:
                # Still a KeyError (as before), but now it names the offending argument
                # and the values it accepts instead of only echoing the bad value.
                raise KeyError(
                    f'invalid value {value!r} for argument {arg_name}, allowed values are {list(value_map)}'
                ) from None
        turi_kwargs.append(f'{arg_name}={value}')

    fastdup_kwargs = {key: value for key, value in input_kwargs.items() if key in fastdup_params}
    fastdup_kwargs['turi_param'] = ','.join(turi_kwargs)
    return fastdup_kwargs

def run(
        input_dir: Union[str, Path] = None,
        annotations: Union[pd.DataFrame, list, dict, str, Path] = None,
        subset: list = None,
        embeddings=None,
        data_type: str = FD.IMG,
        overwrite: bool = False,
        print_summary: bool = True,
        print_vl_datasets_ref: bool = True,
        **fastdup_kwargs
    ):
    """
    Run fastdup on the module-level ``fd`` object.

    This function
        1. calculate subset of images to analyze
        2. run fastdup
        3. map images/bboxes to fastdup index (on bbox this is done in fd._set_fastdup_input)
        4. expand annotation csv to include files that are not in annotation but is in subset
        5. create a version of annotation that is grouped by image
    :param input_dir: input directory containing images
    :param annotations: (Optional) annotations dataframe, the expected column convention is:
            - filename: input_dir-relative filenames
            - img_h, img_w (Optional): image height and width
            - bbox_x, bbox_y, bbox_h, bbox_w (Optional): bounding box arguments. Alternatively x1,y1,x2,y2,x3,y3,x4,y4 for rotated bounding box.
            - split (Optional): data split, e.g. train, test, etc ...
            Alternatively, a list of filenames
            Alternatively, a filename (str or pathlib.Path) of a .csv file, or of a .json file in coco format containing bounding box annotations
            Alternatively, a dictionary containing coco format annotations
    :param subset: (Optional) subset of images to analyze
    :param embeddings: (Optional) pre-calculated feature embeddings. Data type of np.ndarray of size n x d, n is the number of data points, d is the feature vector length.
        data type must be 'float32'.
    :param data_type: (Optional) data type, one of 'image', 'bbox'
    :param overwrite: (Optional) overwrite existing files
    :param print_summary: Print summary report of fastdup run results
    :param print_vl_datasets_ref: call fd.vl_datasets_ref_printout() once the run has finished
    :param fastdup_kwargs: (Optional) fastdup run arguments, see fastdup.run() documentation
    :return: None (after a warning) when fastdup was already applied and overwrite is False, otherwise 0
    """
    fastdup_capture_log_debug_state(locals())

    if fd._fastdup_applied and not overwrite:
        warnings.warn('Fastdup was already applied, use overwrite=True to re-run')
        return
    if annotations is not None:
        if isinstance(annotations, list):
            annotations = pd.DataFrame({'filename':annotations})
        elif isinstance(annotations, dict):
            assert isinstance(fd.input_dir, str), f"When working with COCO annotations need to provide fastdup.create(input_dur=...) with input_dir which is a single assolute path pointing to root folder with all images, got {fd._input_dir}"
            annotations = convert_coco_dict_to_df(annotations, fd._input_dir)
        elif isinstance(annotations, (str, pathlib.Path)):
            if isinstance(annotations, str):
                annotations = shorten_path(annotations)
            assert os.path.isfile(annotations), f"Failed to find annotations file {annotations}"
            # pathlib.Path has no endswith(); test the extension on the string form so that
            # both str and Path inputs are handled.
            annotations_name = str(annotations)
            if annotations_name.endswith('.csv'):
                annotations = pd.read_csv(annotations)
            elif annotations_name.endswith('.json'):
                # context manager closes the file handle even if parsing fails
                with open(annotations, 'r') as json_file:
                    label = json.load(json_file)
                annotations = convert_coco_dict_to_df(label, fd._input_dir)
            else:
                assert False, "Unknown annotation file format, should end with .csv or .json"


        assert isinstance(annotations, pd.DataFrame) and not annotations.empty and "filename" in annotations.columns, f"Got wrong annotation parameter, should be pd.DataFrame with the mandatory columns: filename {annotations}"
    fd._init_run(input_dir, annotations, subset, embeddings, data_type, overwrite, fastdup_kwargs)

    # pre-calculated embeddings: force run_mode 2 and pass the embedding dimension
    if fd._pre_calc_features is not None:
        fastdup_kwargs['run_mode'] = 2
        fastdup_kwargs['d'] = fd._embeddings_dim
    # Read run_stats BEFORE the conversion below: set_fastdup_kwargs folds it into the
    # 'turi_param' string, so afterwards the key is no longer present in the dict.
    stats_disabled = 'run_stats' in fastdup_kwargs and not fastdup_kwargs['run_stats']
    # get user's fastdup kwargs or use default
    fastdup_kwargs = set_fastdup_kwargs(fastdup_kwargs)
    if stats_disabled:
        fd._run_stats = False

    os.makedirs(fd._work_dir, exist_ok=True)
    assert os.path.exists(fd._work_dir), "Failed to create folder " + str(fd._work_dir)

    # on overwrite, remove the atrain_features.dat.csv left in work_dir by an earlier run
    if overwrite and os.path.isfile(os.path.join(fd._work_dir, 'atrain_features.dat.csv')):
        os.unlink(os.path.join(fd._work_dir, 'atrain_features.dat.csv'))
    # run fastdup - create embeddings

    fastdup.run(fd._set_fastdup_input(), work_dir=str(fd._work_dir), **fastdup_kwargs)

    #fastdup_convert_to_relpath(self._work_dir, self._filename_prefix)

    # post process - map fastdup-id to image (for bbox this is done in fd._set_fastdup_input)
    if fd._dtype == FD.IMG or fd._run_mode == FD.MODE_CROP:
        fd._create_img_mapping()

    # expand annotation csv to include files that are not in annotation but is in subset
    fd._expand_annot_df()
    if fd._dtype != FD.BBOX:
        fd._index_annot_df()

    fd._save_artifacts(fastdup_kwargs)
    fd._fastdup_applied = True
    if print_summary:
        fd.summary()
    if print_vl_datasets_ref:
        fd.vl_datasets_ref_printout()

    return 0

# Call the local run() defined above; run_stats, run_advanced_stats and verbose are not
# named parameters of run(), so they are collected by **fastdup_kwargs.
run(
    input_dir=fd.input_dir,
    annotations=df,
    run_stats=True,
    run_advanced_stats=True,
    overwrite=True,
    verbose=True,
)

Finally, after all of this, I noticed my error in the verbose logs. This error was completely my fault, and I probably made very wrong / dumb assumptions. I know for sure that I made a dumb mistake... but I cannot help but feel that this really cool utility library failed me in some ways.

Contact Information [Optional]

No response

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions