Feature/luxonis dataset integration #50

Merged: 24 commits, Apr 9, 2024
Commits
7b47511
feature: add dataset converters (coco, yolo, ldf)
sokovninn Apr 7, 2024
ec191ba
feature: add dataset utils
sokovninn Apr 7, 2024
e50f250
feature: add raw dataset merge
sokovninn Apr 7, 2024
5696944
docs: update examples
sokovninn Apr 7, 2024
3f3883d
feature: add LuxonisDataset, COCO, YOLO formats
sokovninn Apr 7, 2024
c7ec6ef
fix: remove old yolo conversion script
sokovninn Apr 7, 2024
16c92ef
format: fix formatting
sokovninn Apr 7, 2024
e3140b7
chore: update .gitignore
sokovninn Apr 7, 2024
996a445
[Automated] Updated coverage badge
actions-user Apr 7, 2024
a1f11d3
fix: import from utils
sokovninn Apr 8, 2024
68e9f79
fix: not removing dir when --annotate_only
sokovninn Apr 8, 2024
4dc3bca
[Automated] Updated coverage badge
actions-user Apr 8, 2024
d79968e
docs: fix docstrings in converters
sokovninn Apr 8, 2024
bde5fec
refactor: remove file with a typo in name
sokovninn Apr 8, 2024
9ee7ef2
chore: set the minimum required version of luxonis-ml
sokovninn Apr 8, 2024
e74f205
fix: remove redundant function
sokovninn Apr 8, 2024
b3d2f34
refactor: rename ldf to luxonis-dataset
sokovninn Apr 8, 2024
00962b2
Merge branch 'feature/luxonis-dataset-integration' of https://github.…
sokovninn Apr 8, 2024
140ecf3
format: black and ruff
sokovninn Apr 8, 2024
e376152
[Automated] Updated coverage badge
actions-user Apr 8, 2024
3adf29e
feature: add reproducibility with a random seed to converters
sokovninn Apr 9, 2024
ce685a7
test: add converter tests
sokovninn Apr 9, 2024
896af05
format: black
sokovninn Apr 9, 2024
b147b61
[Automated] Updated coverage badge
actions-user Apr 9, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -153,5 +153,6 @@ Thumbs.db
# Others
node_modules/
**generated_dataset*/
**gen_dataset*/
**runs/
**wandb/
2 changes: 2 additions & 0 deletions README.md
@@ -152,6 +152,8 @@ datadreamer --save_dir <directory> --class_names <objects> --prompts_number <num
### 🔧 Additional Parameters

- `--task`: Choose between detection and classification. Default is `detection`.
- `--dataset_format`: Format of the dataset. Defaults to `raw`. Supported values: `raw`, `yolo`, `coco`, `luxonis-dataset`.
- `--split_ratios`: Split ratios for train, validation, and test sets. Defaults to `[0.8, 0.1, 0.1]`.
- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.
- `--prompt_generator`: Choose between `simple`, `lm` (language model) and `tiny` (tiny LM). Default is `simple`.
- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.
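To see the two new options in context, here is a minimal invocation sketch assuming the `datadreamer` entry point shown earlier in this README; the save directory and class names are illustrative placeholders, and the space-separated `--class_names` form is assumed from the existing `<objects>` placeholder.

# Minimal sketch: run the pipeline and export the result as a YOLO dataset.
# Flag names follow this README; values are placeholders, not project defaults.
import subprocess

subprocess.run(
    [
        "datadreamer",
        "--save_dir", "generated_dataset",
        "--class_names", "person", "car", "dog",  # assumed space-separated, per the <objects> placeholder
        "--task", "detection",
        "--dataset_format", "yolo",
        "--split_ratios", "0.8", "0.1", "0.1",
    ],
    check=True,
)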
143 changes: 91 additions & 52 deletions datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -3,6 +3,8 @@
import argparse
import json
import os
import shutil
import uuid

import matplotlib.patches as patches
import matplotlib.pyplot as plt
@@ -24,6 +26,8 @@
TinyLlamaLMPromptGenerator,
WordNetSynonymGenerator,
)
from datadreamer.utils import convert_dataset
from datadreamer.utils.dataset_utils import save_annotations_to_json

prompt_generators = {
"simple": SimplePromptGenerator,
@@ -112,6 +116,21 @@ def parse_args():
help="Image annotator to use",
)

parser.add_argument(
"--dataset_format",
type=str,
default="raw",
choices=["raw", "yolo", "coco", "luxonis-dataset"],
help="Dataset format to use",
)
parser.add_argument(
"--split_ratios",
type=float,
nargs="+",
default=[0.8, 0.1, 0.1],
help="Train-validation-test split ratios (default: 0.8, 0.1, 0.1).",
)

parser.add_argument(
"--synonym_generator",
type=str,
@@ -319,57 +338,38 @@ def check_args(args):
"--image_annotator must be one of the available annotators for classification task"
)

# Check correct task and dataset_format
if args.task == "classification" and args.dataset_format in ["coco", "yolo"]:
raise ValueError(
"--dataset_format must be one of the available dataset formats for classification task"
)

def save_det_annotations_to_json(
image_paths,
boxes_list,
labels_list,
class_names,
save_dir,
file_name="annotations.json",
):
annotations = {}
for image_path, bboxes, labels in zip(image_paths, boxes_list, labels_list):
image_name = os.path.basename(image_path)
annotations[image_name] = {
"boxes": bboxes.tolist(),
"labels": labels.tolist(),
}
annotations["class_names"] = class_names

# Save to JSON file
with open(os.path.join(save_dir, file_name), "w") as f:
json.dump(annotations, f, indent=4)


def save_clf_annotations_to_json(
image_paths, labels_list, class_names, save_dir, file_name="annotations.json"
):
annotations = {}
for image_path, labels in zip(image_paths, labels_list):
image_name = os.path.basename(image_path)
annotations[image_name] = {
"labels": labels.tolist(),
}
annotations["class_names"] = class_names

# Save to JSON file
with open(os.path.join(save_dir, file_name), "w") as f:
json.dump(annotations, f, indent=4)
# Check split_ratios
if (
len(args.split_ratios) != 3
or not all(0 <= ratio <= 1 for ratio in args.split_ratios)
or sum(args.split_ratios) != 1
):
raise ValueError(
"--split_ratios must be a list of three floats that sum up to 1"
)


def main():
args = parse_args()
check_args(args)

# Directories for saving images and bboxes
save_dir = args.save_dir
if not args.annotate_only:
if os.path.exists(save_dir):
shutil.rmtree(save_dir)
os.makedirs(save_dir)

# Directories for saving images and bboxes
bbox_dir = os.path.join(save_dir, "bboxes_visualization")
if not os.path.exists(save_dir):
os.makedirs(save_dir)
if not os.path.exists(bbox_dir):
os.makedirs(bbox_dir)
if os.path.exists(bbox_dir):
shutil.rmtree(bbox_dir)
os.makedirs(bbox_dir)

# Save arguments
with open(os.path.join(save_dir, "generation_args.json"), "w") as f:
@@ -417,7 +417,9 @@ def main():
prompts, prompt_objects
):
for generated_image in generated_images_batch:
image_path = os.path.join(save_dir, f"image_{num_generated_images}.jpg")
unique_id = uuid.uuid4().hex
unique_filename = f"image_{num_generated_images}_{unique_id}.jpg"
image_path = os.path.join(save_dir, unique_filename)
generated_image.save(image_path)
image_paths.append(image_path)
num_generated_images += 1
@@ -442,12 +444,15 @@
synonym_dict, os.path.join(save_dir, "synonyms.json")
)

boxes_list = []
scores_list = []
labels_list = []

if args.task == "classification":
# Classification annotation
annotator_class = clf_annotators[args.image_annotator]
annotator = annotator_class(device=args.device, size=args.annotator_size)

labels_list = []
# Split image_paths into batches
image_batches = [
image_paths[i : i + args.batch_size_annotation]
@@ -468,24 +473,22 @@
)
labels_list.extend(batch_labels)

save_clf_annotations_to_json(
image_paths, labels_list, args.class_names, save_dir
save_annotations_to_json(
image_paths=image_paths,
labels_list=labels_list,
class_names=args.class_names,
save_dir=save_dir,
)
else:
# Annotation
annotator_class = det_annotators[args.image_annotator]
annotator = annotator_class(device=args.device, size=args.annotator_size)

boxes_list = []
scores_list = []
labels_list = []

# Split image_paths into batches
image_batches = [
image_paths[i : i + args.batch_size_annotation]
for i in range(0, len(image_paths), args.batch_size_annotation)
]

for i, image_batch in tqdm(
enumerate(image_batches),
desc="Annotating images",
@@ -546,8 +549,44 @@
plt.close()

# Save annotations as JSON files
save_det_annotations_to_json(
image_paths, boxes_list, labels_list, args.class_names, save_dir
save_annotations_to_json(
image_paths=image_paths,
labels_list=labels_list,
boxes_list=boxes_list,
class_names=args.class_names,
save_dir=save_dir,
)

if args.dataset_format == "yolo":
# Convert annotations to YOLO format
convert_dataset.convert_dataset(
args.save_dir,
args.save_dir,
"yolo",
args.split_ratios,
copy_files=False,
seed=args.seed,
)
# Convert annotations to COCO format
elif args.dataset_format == "coco":
convert_dataset.convert_dataset(
args.save_dir,
args.save_dir,
"coco",
args.split_ratios,
copy_files=False,
seed=args.seed,
)

# Convert annotations to LuxonisDataset format
if args.dataset_format == "luxonis-dataset":
convert_dataset.convert_dataset(
args.save_dir,
args.save_dir,
"luxonis-dataset",
args.split_ratios,
copy_files=False,
seed=args.seed,
)


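For reference, a minimal sketch of calling the converter entry point directly, outside the pipeline. The exact signature of convert_dataset.convert_dataset is not shown in this PR, so the positional order below is an assumption that mirrors the call sites in the diff above; the directory path is a placeholder.

from datadreamer.utils import convert_dataset

# Convert a previously generated raw dataset in place to YOLO format.
# Positional order mirrors the pipeline call sites above:
# (input_dir, output_dir, dataset_format, split_ratios), then copy_files and seed.
convert_dataset.convert_dataset(
    "generated_dataset",   # placeholder: directory produced by a raw run
    "generated_dataset",   # the pipeline writes the converted set to the same directory
    "yolo",                # or "coco" / "luxonis-dataset"
    [0.8, 0.1, 0.1],       # train/val/test split ratios
    copy_files=False,
    seed=42,
)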
12 changes: 12 additions & 0 deletions datadreamer/utils/__init__.py
@@ -1 +1,13 @@
from __future__ import annotations

from .base_converter import BaseConverter
from .coco_converter import COCOConverter
from .luxonis_dataset_converter import LuxonisDatasetConverter
from .yolo_converter import YOLOConverter

__all__ = [
"BaseConverter",
"COCOConverter",
"LuxonisDatasetConverter",
"YOLOConverter",
]
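These exports make the converters importable directly from datadreamer.utils. Assuming the concrete classes follow the BaseConverter interface defined in the next file (a seed in the constructor and convert(dataset_dir, output_dir, split_ratios, copy_files)), a standalone conversion could look like the following sketch; the paths are placeholders and the keyword names are taken from the abstract docstring.

from datadreamer.utils import YOLOConverter

# Hypothetical standalone use, assuming YOLOConverter keeps BaseConverter's signature.
converter = YOLOConverter(seed=42)          # seeds NumPy for reproducible splits
converter.convert(
    dataset_dir="generated_dataset",        # placeholder: raw dataset with annotations.json
    output_dir="generated_dataset_yolo",    # placeholder output location
    split_ratios=[0.8, 0.1, 0.1],
    copy_files=True,                        # copy images instead of moving them
)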
69 changes: 69 additions & 0 deletions datadreamer/utils/base_converter.py
@@ -0,0 +1,69 @@
from __future__ import annotations

import json
from abc import ABC, abstractmethod

import numpy as np


class BaseConverter(ABC):
"""Abstract base class for converter."""

def __init__(self, seed=42):
np.random.seed(seed)

@abstractmethod
def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True):
"""Converts a dataset into another format.

Args:
- dataset_dir (str): The directory where the source dataset is located.
- output_dir (str): The directory where the processed dataset should be saved.
- split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
- copy_files (bool, optional): Whether to copy the source files to the output directory, otherwise move them. Defaults to True.


No return value.
"""
pass

@staticmethod
def read_annotations(annotation_path):
"""Reads annotations from a JSON file located at the specified path.

Args:
- annotation_path (str): The path to the JSON file containing annotations.

Returns:
- dict: A dictionary containing the data loaded from the JSON file.
"""
with open(annotation_path) as f:
data = json.load(f)
return data

@staticmethod
def make_splits(images, split_ratios, shuffle=True):
"""Splits the list of images into training, validation, and test sets.

Args:
- images (list of str): A list of image paths.
- split_ratios (list of float): The ratios to split the data into training, validation, and test sets.
- shuffle (bool, optional): Whether to shuffle the list of images. Defaults to True.

Returns:
- list of str: A list of image paths for the training set.
- list of str: A list of image paths for the validation set.
- list of str: A list of image paths for the test set.
"""
if shuffle:
np.random.shuffle(images)

train_images = images[: int(len(images) * split_ratios[0])]
val_images = images[
int(len(images) * split_ratios[0]) : int(
len(images) * (split_ratios[0] + split_ratios[1])
)
]
test_images = images[int(len(images) * (split_ratios[0] + split_ratios[1])) :]

return train_images, val_images, test_images
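A small usage sketch of the splitting helper above; the image names are illustrative. Reproducibility comes from seeding NumPy's global RNG, which is what the converter constructor does.

import numpy as np
from datadreamer.utils import BaseConverter

np.random.seed(42)  # mirrors BaseConverter(seed=42)

images = [f"image_{i}.jpg" for i in range(10)]
train, val, test = BaseConverter.make_splits(images, [0.8, 0.1, 0.1])
print(len(train), len(val), len(test))  # 8 1 1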