mindspore-lab · tonytonglt · Feb 8, 2024 · Feb 8, 2024 · Feb 8, 2024 · Feb 8, 2024
diff --git a/configs/layout/yolov8/README.md b/configs/layout/yolov8/README.md
@@ -160,7 +160,7 @@ python infer.py \
     --input_images_dir=/your_path_to/val \
     --layout_model_path=your_path_to/output.mindir \
     --layout_model_name_or_config=../../configs/layout/yolov8/yolov8n.yaml \
-    --layout_save_dir=results_dir
+    --res_save_dir=results_dir
 ```
 
 ## 6. Visualization

diff --git a/configs/layout/yolov8/README_CN.md b/configs/layout/yolov8/README_CN.md
@@ -173,7 +173,7 @@ python infer.py \
     --input_images_dir=/your_path_to/val \
     --layout_model_path=your_path_to/output.mindir \
     --layout_model_name_or_config=../../configs/layout/yolov8/yolov8n.yaml \
-    --layout_save_dir=results_dir
+    --res_save_dir=results_dir
 ```
 
 ## 6. 可视化

diff --git a/mindocr/data/builder.py b/mindocr/data/builder.py
@@ -266,8 +266,13 @@ def _parse_minddata_op(dataset_args):
             minddata_op_list.append(color_adjust_op)
             continue
         if "NormalizeImage" in transform_dict.keys():
+            from mindocr.data.transforms.general_transforms import get_value
+
+            normalize_transform = transform_dict["NormalizeImage"] or {}  # entry without args may be None
+            mean = get_value(normalize_transform.get("mean", "imagenet"), "mean")
+            std = get_value(normalize_transform.get("std", "imagenet"), "std")
             minddata_op_idx.append(i)
-            normalize_op = ms.dataset.vision.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)
+            normalize_op = ms.dataset.vision.Normalize(mean=mean, std=std)
             minddata_op_list.append(normalize_op)
             continue
         if "ToCHWImage" in transform_dict.keys():

diff --git a/mindocr/data/det_dataset.py b/mindocr/data/det_dataset.py
@@ -75,7 +75,7 @@ def __init__(
 
         # create transform
         if transform_pipeline is not None:
-            global_config = dict(is_train=is_train)
+            global_config = dict(is_train=is_train, use_minddata=kwargs.get("use_minddata", False))
             self.transforms = create_transforms(transform_pipeline, global_config)
         else:
             raise ValueError("No transform pipeline is specified!")

diff --git a/mindocr/data/layout_dataset.py b/mindocr/data/layout_dataset.py
@@ -134,12 +134,15 @@ def __init__(
         self.img_shapes = np.array(shapes, dtype=np.float64)
         self.img_files = list(cache.keys())  # update
         if not is_train:
-            with open(annotations_path, "r") as f:
-                data = json.load(f)
-            file_id_dict = dict()
-            for item in data["images"]:
-                file_id_dict[item["file_name"]] = item["id"]
-            self.image_ids = [file_id_dict[img_file.split("/")[-1]] for img_file in self.img_files]
+            if os.path.isfile(annotations_path):
+                with open(annotations_path, "r") as f:
+                    data = json.load(f)
+                file_id_dict = dict()
+                for item in data["images"]:
+                    file_id_dict[item["file_name"]] = item["id"]
+                self.image_ids = [file_id_dict[img_file.split("/")[-1]] for img_file in self.img_files]
+            else:
+                self.image_ids = self.img_files
         else:
             self.image_ids = None
         self.label_files = self._img2label_paths(cache.keys())  # update

diff --git a/mindocr/data/transforms/general_transforms.py b/mindocr/data/transforms/general_transforms.py
@@ -1,12 +1,11 @@
-import random
 from typing import List, Union
 
 import cv2
 import numpy as np
 from PIL import Image
 
-from mindspore.dataset.vision import RandomColorAdjust as MSRandomColorAdjust
-from mindspore.dataset.vision import ToPIL
+from mindspore import dataset as ds
+from mindspore.dataset import vision
 
 from ...data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 
@@ -22,6 +21,16 @@
 ]
 
 
+def get_value(val, name):
+    if isinstance(val, str) and val.lower() == "imagenet":
+        assert name in ["mean", "std"]
+        return IMAGENET_DEFAULT_MEAN if name == "mean" else IMAGENET_DEFAULT_STD
+    elif isinstance(val, list):
+        return val
+    else:
+        raise ValueError(f"Wrong {name} value: {val}")
+
+
 class DecodeImage:
     """
     img_mode (str): The channel order of the output, 'BGR' and 'RGB'. Default to 'BGR'.
@@ -37,17 +46,31 @@ def __init__(
         self.flag = cv2.IMREAD_IGNORE_ORIENTATION | cv2.IMREAD_COLOR if ignore_orientation else cv2.IMREAD_COLOR
         self.keep_ori = keep_ori
 
+        self.use_minddata = kwargs.get("use_minddata", False)
+        self.decoder = None
+        self.cvt_color = None
+        if self.use_minddata:
+            self.decoder = vision.Decode()  # MindSpore's decode transform is `Decode`
+            self.cvt_color = vision.ConvertColor(vision.ConvertMode.COLOR_BGR2RGB)
+
     def __call__(self, data):
         if "img_path" in data:
             with open(data["img_path"], "rb") as f:
                 img = f.read()
         elif "img_lmdb" in data:
             img = data["img_lmdb"]
+        else:
+            raise ValueError('"img_path" or "img_lmdb" must be in input data')
         img = np.frombuffer(img, dtype="uint8")
-        img = cv2.imdecode(img, self.flag)
 
-        if self.img_mode == "RGB":
-            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        if self.use_minddata:
+            img = self.decoder(img)
+            if self.img_mode == "BGR":
+                img = self.cvt_color(img)
+        else:
+            img = cv2.imdecode(img, self.flag)
+            if self.img_mode == "RGB":
+                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 
         if self.channel_first:
             img = img.transpose((2, 0, 1))
@@ -86,16 +109,33 @@ def __init__(
 
         # TODO: detect hwc or chw automatically
         shape = (3, 1, 1) if not is_hwc else (1, 1, 3)
-        self.mean = np.array(self._get_value(mean, "mean")).reshape(shape).astype("float32")
-        self.std = np.array(self._get_value(std, "std")).reshape(shape).astype("float32")
+        self.mean = get_value(mean, "mean")
+        self.std = get_value(std, "std")
         self.is_hwc = is_hwc
 
+        self.use_minddata = kwargs.get("use_minddata", False)
+        self.normalize = None
+        self.cvt_color = None
+        if self.use_minddata:
+            self.normalize = vision.Normalize(self.mean, self.std, is_hwc)  # invoked by __call__
+            self.cvt_color = vision.ConvertColor(vision.ConvertMode.COLOR_BGR2RGB)
+        else:
+            self.mean = np.array(self.mean).reshape(shape).astype("float32")
+            self.std = np.array(self.std).reshape(shape).astype("float32")
+
     def __call__(self, data):
         img = data["image"]
         if isinstance(img, Image.Image):
             img = np.array(img)
         assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage"
 
+        if self.use_minddata:
+            if self._channel_conversion:
+                img = self.cvt_color(img)
+            img = self.normalize(img)
+            data["image"] = img
+            return data
+
         if self._channel_conversion:
             if self.is_hwc:
                 img = img[..., [2, 1, 0]]
@@ -105,26 +145,22 @@ def __call__(self, data):
         data["image"] = (img.astype("float32") - self.mean) / self.std
         return data
 
-    @staticmethod
-    def _get_value(val, name):
-        if isinstance(val, str) and val.lower() == "imagenet":
-            assert name in ["mean", "std"]
-            return IMAGENET_DEFAULT_MEAN if name == "mean" else IMAGENET_DEFAULT_STD
-        elif isinstance(val, list):
-            return val
-        else:
-            raise ValueError(f"Wrong {name} value: {val}")
-
 
 class ToCHWImage:
     # convert hwc image to chw image
     def __init__(self, **kwargs):
-        pass
+        self.use_minddata = kwargs.get("use_minddata", False)
+        self.hwc2chw = None
+        if self.use_minddata:
+            self.hwc2chw = vision.HWC2CHW()
 
     def __call__(self, data):
         img = data["image"]
         if isinstance(img, Image.Image):
             img = np.array(img)
+        if self.use_minddata:
+            data["image"] = self.hwc2chw(img)
+            return data
         data["image"] = img.transpose((2, 0, 1))
         return data
 
@@ -181,7 +217,7 @@ def __call__(self, data: dict) -> dict:
             image
             (polys)
         """
-        if random.random() < self._p:
+        if np.random.random() < self._p:
             if self._size_limits:
                 size = data["image"].shape[:2]
                 min_scale = max(self._size_limits[0] / size[0], self._size_limits[0] / size[1], self._range[0])
@@ -201,16 +237,18 @@ class RandomColorAdjust:
     def __init__(self, brightness=32.0 / 255, saturation=0.5, **kwargs):
         contrast = kwargs.get("contrast", (1, 1))
         hue = kwargs.get("hue", (0, 0))
-        self._jitter = MSRandomColorAdjust(brightness=brightness, saturation=saturation, contrast=contrast, hue=hue)
-        self._pil = ToPIL()
+        self._jitter = vision.RandomColorAdjust(
+            brightness=brightness, saturation=saturation, contrast=contrast, hue=hue
+        )
+        self._jitter.implementation = ds.Implementation.C
 
     def __call__(self, data):
         """
         required keys: image
         modified keys: image
         """
         # there's a bug in MindSpore that requires images to be converted to the PIL format first
-        data["image"] = np.array(self._jitter(self._pil(data["image"])))
+        data["image"] = self._jitter(data["image"])
         return data
 
 
@@ -230,8 +268,8 @@ def __init__(self, degrees=(-10, 10), expand_canvas=True, p: float = 1.0, **kwar
         self._p = p
 
     def __call__(self, data: dict) -> dict:
-        if random.random() < self._p:
-            angle = random.randint(self._degrees[0], self._degrees[1])
+        if np.random.random() < self._p:
+            angle = np.random.randint(self._degrees[0], self._degrees[1] + 1)  # numpy's high is exclusive
             h, w = data["image"].shape[:2]
 
             center = w // 2, h // 2  # x, y
@@ -265,7 +303,7 @@ def __init__(self, p: float = 0.5, **kwargs):
         self._p = p
 
     def __call__(self, data: dict) -> dict:
-        if random.random() < self._p:
+        if np.random.random() < self._p:
             data["image"] = cv2.flip(data["image"], 1)
 
             if "polys" in data and len(data["polys"]):