Skip to content

add minddata functions #667

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/layout/yolov8/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ python infer.py \
--input_images_dir=/your_path_to/val \
--layout_model_path=your_path_to/output.mindir \
--layout_model_name_or_config=../../configs/layout/yolov8/yolov8n.yaml \
--layout_save_dir=results_dir
--res_save_dir=results_dir
```

## 6. Visualization
Expand Down
2 changes: 1 addition & 1 deletion configs/layout/yolov8/README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ python infer.py \
--input_images_dir=/your_path_to/val \
--layout_model_path=your_path_to/output.mindir \
--layout_model_name_or_config=../../configs/layout/yolov8/yolov8n.yaml \
--layout_save_dir=results_dir
--res_save_dir=results_dir
```

## 6. 可视化
Expand Down
7 changes: 6 additions & 1 deletion mindocr/data/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,13 @@ def _parse_minddata_op(dataset_args):
minddata_op_list.append(color_adjust_op)
continue
if "NormalizeImage" in transform_dict.keys():
from mindocr.data.transforms.general_transforms import get_value

normalize_transform = transform_dict["NormalizeImage"]
mean = get_value(normalize_transform.get("mean", "imagenet"), "mean")
std = get_value(normalize_transform.get("std", "imagenet"), "std")
minddata_op_idx.append(i)
normalize_op = ms.dataset.vision.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)
normalize_op = ms.dataset.vision.Normalize(mean=mean, std=std)
minddata_op_list.append(normalize_op)
continue
if "ToCHWImage" in transform_dict.keys():
Expand Down
2 changes: 1 addition & 1 deletion mindocr/data/det_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(

# create transform
if transform_pipeline is not None:
global_config = dict(is_train=is_train)
global_config = dict(is_train=is_train, use_minddata=kwargs.get("use_minddata", False))
self.transforms = create_transforms(transform_pipeline, global_config)
else:
raise ValueError("No transform pipeline is specified!")
Expand Down
15 changes: 9 additions & 6 deletions mindocr/data/layout_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,15 @@ def __init__(
self.img_shapes = np.array(shapes, dtype=np.float64)
self.img_files = list(cache.keys()) # update
if not is_train:
with open(annotations_path, "r") as f:
data = json.load(f)
file_id_dict = dict()
for item in data["images"]:
file_id_dict[item["file_name"]] = item["id"]
self.image_ids = [file_id_dict[img_file.split("/")[-1]] for img_file in self.img_files]
if os.path.isfile(annotations_path):
with open(annotations_path, "r") as f:
data = json.load(f)
file_id_dict = dict()
for item in data["images"]:
file_id_dict[item["file_name"]] = item["id"]
self.image_ids = [file_id_dict[img_file.split("/")[-1]] for img_file in self.img_files]
else:
self.image_ids = self.img_files
else:
self.image_ids = None
self.label_files = self._img2label_paths(cache.keys()) # update
Expand Down
90 changes: 64 additions & 26 deletions mindocr/data/transforms/general_transforms.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import random
from typing import List, Union

import cv2
import numpy as np
from PIL import Image

from mindspore.dataset.vision import RandomColorAdjust as MSRandomColorAdjust
from mindspore.dataset.vision import ToPIL
from mindspore import dataset as ds
from mindspore.dataset import vision

from ...data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

Expand All @@ -22,6 +21,16 @@
]


def get_value(val, name):
    """Resolve a normalization statistic given in a transform config.

    Args:
        val: Either the string ``"imagenet"`` (case-insensitive), which selects
            the ImageNet default constant, or an explicit list/tuple of values.
        name: Which statistic is being resolved, ``"mean"`` or ``"std"``. It is
            only consulted when the ``"imagenet"`` preset is requested and is
            otherwise used just for the error message.

    Returns:
        ``IMAGENET_DEFAULT_MEAN`` or ``IMAGENET_DEFAULT_STD`` for the preset;
        the list itself when ``val`` is a list; a list copy when it is a tuple.

    Raises:
        ValueError: If ``val`` is neither the preset nor a list/tuple, or if
            the preset is requested for a ``name`` other than "mean"/"std".
    """
    if isinstance(val, str) and val.lower() == "imagenet":
        # Raise explicitly instead of asserting: asserts are stripped under
        # ``python -O`` and an unknown name would then silently resolve to the
        # std constant.
        if name not in ("mean", "std"):
            raise ValueError(f"Unknown statistic name: {name!r}, expected 'mean' or 'std'")
        return IMAGENET_DEFAULT_MEAN if name == "mean" else IMAGENET_DEFAULT_STD
    if isinstance(val, list):
        return val
    if isinstance(val, tuple):
        # Tuples are a common way to spell per-channel statistics in Python
        # code; normalise them to a list so the return type stays uniform.
        return list(val)
    raise ValueError(f"Wrong {name} value: {val}")


class DecodeImage:
"""
img_mode (str): The channel order of the output, 'BGR' and 'RGB'. Default to 'BGR'.
Expand All @@ -37,17 +46,31 @@ def __init__(
self.flag = cv2.IMREAD_IGNORE_ORIENTATION | cv2.IMREAD_COLOR if ignore_orientation else cv2.IMREAD_COLOR
self.keep_ori = keep_ori

self.use_minddata = kwargs.get("use_minddata", False)
self.decoder = None
self.cvt_color = None
if self.use_minddata:
self.decoder = vision.Decoder()
self.cvt_color = vision.ConvertColor(vision.ConvertMode.COLOR_BGR2RGB)

def __call__(self, data):
if "img_path" in data:
with open(data["img_path"], "rb") as f:
img = f.read()
elif "img_lmdb" in data:
img = data["img_lmdb"]
else:
raise ValueError('"img_path" or "img_lmdb" must be in input data')
img = np.frombuffer(img, dtype="uint8")
img = cv2.imdecode(img, self.flag)

if self.img_mode == "RGB":
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
if self.use_minddata:
img = self.decoder(img)
if self.img_mode == "BGR":
img = self.cvt_color(img)
else:
img = cv2.imdecode(img, self.flag)
if self.img_mode == "RGB":
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

if self.channel_first:
img = img.transpose((2, 0, 1))
Expand Down Expand Up @@ -86,16 +109,33 @@ def __init__(

# TODO: detect hwc or chw automatically
shape = (3, 1, 1) if not is_hwc else (1, 1, 3)
self.mean = np.array(self._get_value(mean, "mean")).reshape(shape).astype("float32")
self.std = np.array(self._get_value(std, "std")).reshape(shape).astype("float32")
self.mean = get_value(mean, "mean")
self.std = get_value(std, "std")
self.is_hwc = is_hwc

self.use_minddata = kwargs.get("use_minddata", False)
self.normalize = None
self.cvt_color = None
if self.use_minddata:
self.decoder = vision.Normalize(self.mean, self.std, is_hwc)
self.cvt_color = vision.ConvertColor(vision.ConvertMode.COLOR_BGR2RGB)
else:
self.mean = np.array(self.mean).reshape(shape).astype("float32")
self.std = np.array(self.std).reshape(shape).astype("float32")

def __call__(self, data):
img = data["image"]
if isinstance(img, Image.Image):
img = np.array(img)
assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage"

if self.use_minddata:
if self._channel_conversion:
img = self.cvt_color(img)
img = self.normalize(img)
data["image"] = img
return data

if self._channel_conversion:
if self.is_hwc:
img = img[..., [2, 1, 0]]
Expand All @@ -105,26 +145,22 @@ def __call__(self, data):
data["image"] = (img.astype("float32") - self.mean) / self.std
return data

@staticmethod
def _get_value(val, name):
if isinstance(val, str) and val.lower() == "imagenet":
assert name in ["mean", "std"]
return IMAGENET_DEFAULT_MEAN if name == "mean" else IMAGENET_DEFAULT_STD
elif isinstance(val, list):
return val
else:
raise ValueError(f"Wrong {name} value: {val}")


class ToCHWImage:
    """Convert the ``image`` entry of a sample from HWC to CHW layout.

    Args:
        **kwargs: ``use_minddata`` (bool, default ``False``) selects the
            ``vision.HWC2CHW`` operator instead of a NumPy transpose. Other
            keys are ignored.
    """

    def __init__(self, **kwargs):
        self.use_minddata = kwargs.get("use_minddata", False)
        # The operator is only instantiated when the minddata path is enabled.
        self.hwc2chw = vision.HWC2CHW() if self.use_minddata else None

    def __call__(self, data):
        """Replace ``data["image"]`` with its channel-first version and return ``data``."""
        image = data["image"]
        if isinstance(image, Image.Image):
            # PIL images are materialised as arrays before the layout change.
            image = np.array(image)
        if self.use_minddata:
            converted = self.hwc2chw(image)
        else:
            converted = image.transpose((2, 0, 1))
        data["image"] = converted
        return data

Expand Down Expand Up @@ -181,7 +217,7 @@ def __call__(self, data: dict) -> dict:
image
(polys)
"""
if random.random() < self._p:
if np.random.random() < self._p:
if self._size_limits:
size = data["image"].shape[:2]
min_scale = max(self._size_limits[0] / size[0], self._size_limits[0] / size[1], self._range[0])
Expand All @@ -201,16 +237,18 @@ class RandomColorAdjust:
def __init__(self, brightness=32.0 / 255, saturation=0.5, **kwargs):
contrast = kwargs.get("contrast", (1, 1))
hue = kwargs.get("hue", (0, 0))
self._jitter = MSRandomColorAdjust(brightness=brightness, saturation=saturation, contrast=contrast, hue=hue)
self._pil = ToPIL()
self._jitter = vision.RandomColorAdjust(
brightness=brightness, saturation=saturation, contrast=contrast, hue=hue
)
self._jitter.implementation = ds.Implementation.C

def __call__(self, data):
"""
required keys: image
modified keys: image
"""
# there's a bug in MindSpore that requires images to be converted to the PIL format first
data["image"] = np.array(self._jitter(self._pil(data["image"])))
data["image"] = self._jitter(data["image"])
return data


Expand All @@ -230,8 +268,8 @@ def __init__(self, degrees=(-10, 10), expand_canvas=True, p: float = 1.0, **kwar
self._p = p

def __call__(self, data: dict) -> dict:
if random.random() < self._p:
angle = random.randint(self._degrees[0], self._degrees[1])
if np.random.random() < self._p:
angle = np.random.randint(self._degrees[0], self._degrees[1])
h, w = data["image"].shape[:2]

center = w // 2, h // 2 # x, y
Expand Down Expand Up @@ -265,7 +303,7 @@ def __init__(self, p: float = 0.5, **kwargs):
self._p = p

def __call__(self, data: dict) -> dict:
if random.random() < self._p:
if np.random.random() < self._p:
data["image"] = cv2.flip(data["image"], 1)

if "polys" in data and len(data["polys"]):
Expand Down