How to speed up "Generating train split" #6205
-
How to speed up "Generating train split". I used
My dataset's features include images, and generating the train split is too slow.
Beta Was this translation helpful? Give feedback.
Replies: 8 comments 2 replies
-
To parallelize the loading, the `gen_kwargs` requires a list that can be split into `num_proc` parts (shards), which are then passed to the generator — e.g., pass a list of image files or a list of directories (with the images) to parallelize over them.
Beta Was this translation helpful? Give feedback.
-
@mariosasko I passed in a path list, but now the progress is not shown: |
Beta Was this translation helpful? Give feedback.
-
import datasets
import os
from PIL import Image
import json
import torch
import cv2
import numpy as np
class ImagesConfig(datasets.BuilderConfig):
    """BuilderConfig for the ``Images`` builder.

    Adds no options beyond the base ``datasets.BuilderConfig``; it exists
    so each named config (``similar_pairs``, ``image_prompt_pairs``) has a
    dedicated config type.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
class Images(datasets.GeneratorBasedBuilder):
    """Builder for a local image dataset described by a ``meta_data.json`` file.

    Two configs are supported:
      * ``similar_pairs``       — each row is a pair of similar images plus a
                                  float similarity score.
      * ``image_prompt_pairs``  — each row is a single image plus its prompt.

    ``meta_data.json`` (expected in ``self.config.data_dir``) maps each image
    path to a dict holding a ``similar_images`` list of ``(path, similarity)``
    pairs and/or a ``prompt`` string.
    """

    BUILDER_CONFIGS = [
        ImagesConfig(
            name="similar_pairs",
            # typo fixed: "simliar pair dataset,item" -> readable description
            description="similar pair dataset, item is a pair of similar images",
        ),
        ImagesConfig(
            name="image_prompt_pairs",
            description="image prompt pairs",
        ),
    ]

    def __init__(self, **kwargs):
        # A smaller writer batch size bounds memory use while Arrow-encoding
        # rows that embed full images; must be set before the base __init__.
        self.DEFAULT_WRITER_BATCH_SIZE = 100
        super().__init__(**kwargs)

    def _info(self):
        """Return the feature schema for the active config."""
        if self.config.name == "similar_pairs":
            return datasets.DatasetInfo(
                features=datasets.Features(
                    {
                        "image1": datasets.features.Image(),
                        "image1_path": datasets.Value("string"),
                        "image2": datasets.features.Image(),
                        "image2_path": datasets.Value("string"),
                        "similarity": datasets.Value("float32"),
                    }
                )
            )
        elif self.config.name == "image_prompt_pairs":
            return datasets.DatasetInfo(
                features=datasets.Features(
                    {
                        "image": datasets.features.Image(),
                        "image_path": datasets.Value("string"),
                        "prompt": datasets.Value("string"),
                    }
                )
            )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Collect lightweight (path, ...) tuples for the train split.

        Images themselves are NOT opened here — only paths and scores are
        gathered, so the split scan stays fast; decoding happens lazily in
        ``_generate_examples``.
        """
        with open(os.path.join(self.config.data_dir, "meta_data.json"), "r") as f:
            meta_data = json.load(f)
        data = []
        if self.config.name == "similar_pairs":
            for image1_path in meta_data:
                for image2_path, similarity in meta_data[image1_path]["similar_images"]:
                    data.append((image1_path, image2_path, similarity))
        elif self.config.name == "image_prompt_pairs":
            for image_path in meta_data:
                # BUG FIX: the original passed two positional arguments to
                # list.append (a TypeError at runtime); the pair must be a tuple.
                data.append((image_path, meta_data[image_path]["prompt"]))
        print("data size:", len(data))
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"split": datasets.Split.TRAIN, "data": data},
            )
        ]

    def _generate_examples(self, split, data):
        """Yield ``(key, example)`` pairs, opening images lazily per example."""
        if self.config.name == "similar_pairs":
            for image1_path, image2_path, similarity in data:
                yield image1_path + ":" + image2_path, {
                    "image1": Image.open(
                        os.path.join(self.config.data_dir, image1_path)
                    ),
                    "image1_path": image1_path,
                    "image2": Image.open(
                        os.path.join(self.config.data_dir, image2_path)
                    ),
                    "image2_path": image2_path,
                    "similarity": similarity,
                }
        elif self.config.name == "image_prompt_pairs":
            # BUG FIX: the original generator had no branch for this config,
            # so _split_generators built data that was never yielded.
            for image_path, prompt in data:
                yield image_path, {
                    "image": Image.open(
                        os.path.join(self.config.data_dir, image_path)
                    ),
                    "image_path": image_path,
                    "prompt": prompt,
                }
# Usage: load the dataset via the local builder-script directory.
# (Import restored — it was fused into the preceding line by extraction.)
from datasets import load_dataset

ds = load_dataset(
    "/home/aihao/workspace/DeepLearningContent/datasets/images",
    "similar_pairs",
    # Pass data_dir explicitly by keyword rather than as a bare third
    # positional argument, so the call stays correct if the signature shifts.
    data_dir="/home/aihao/workspace/DeepLearningContent/datasets/images",
    split="train",
)
Beta Was this translation helpful? Give feedback.
-
@mariosasko It will output "data size: 126454". But calling "Image.open" in |
Beta Was this translation helpful? Give feedback.
-
@mariosasko I tested it again on ubuntu, which turned out to be wsl. It's incredibly fast, but it seems to be slower to train. I'm confused |
Beta Was this translation helpful? Give feedback.
-
@mariosasko It's fine again now. It doesn't matter. thanks |
Beta Was this translation helpful? Give feedback.
-
@mariosasko Does it precompute 400k image pairs in a few seconds? It's incredibly fast. The original code took me about 24 hours for 100k image pairs |
Beta Was this translation helpful? Give feedback.
-
Hi @mariosasko and @aihao2000. I have looked through this solution and I am still confused about how to obtain an efficient solution for an online example. I have a minimal reproducible example below, where I have modified the
The error
For some reason, this is slower than if I were to stream the entire dataset (all shards), and it defaults to |
Beta Was this translation helpful? Give feedback.
To parallelize the loading, the `gen_kwargs` requires a list that can be split into `num_proc` parts (shards), which are then passed to the generator (e.g., pass a list of image files or a list of directories (with the images) to parallelize over them).