Skip to content

Add comments to dataset_root and data_dir #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions mindocr/data/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ def build_dataset(
Args:
dataset_config (dict): dataset reading and processing configuration containing keys:
- type: dataset type, 'DetDataset', 'RecDataset'
- data_dir Union[str, List]: folder to the dataset.
- label_file (optional for recognition): file path(s) to the annotation file
- transform_pipeline (list[dict]): config dict for image and label transformation
- dataset_root (str): the root directory to store the (multiple) dataset(s)
- data_dir (Union[str, List[str]]): directory to the data, which is a subfolder path relative to `dataset_root`. For multiple datasets, it is a list of subfolder paths.
- label_file (Union[str, List[str]]): file path to the annotation file, relative to `dataset_root`. For multiple datasets, it is a list of relative file paths.
- transform_pipeline (list[dict]): each element corresponds to a transform operation on image and/or label

loader_config (dict): dataloader configuration containing keys:
- batch_size: batch size for data loader
- drop_remainder: whether to drop the data in the last batch when the total of data can not be divided by the batch_size
Expand All @@ -33,13 +35,11 @@ def build_dataset(
Return:
data_loader (Dataset): dataloader to generate data batch
'''
# build datasets
dataset_class_name = dataset_config.pop('type')
assert dataset_class_name in supported_dataset_types, "Invalid dataset name"
## convert data_dir and to abs path. TODO: do it inside dataset class init?

## check and process dataset_root, data_dir, and label_file.
if 'dataset_root' in dataset_config:
if isinstance(dataset_config['data_dir'], str):
dataset_config['data_dir'] = os.path.join(dataset_config['dataset_root'], dataset_config['data_dir'])
dataset_config['data_dir'] = os.path.join(dataset_config['dataset_root'], dataset_config['data_dir']) # to absolute path
else:
dataset_config['data_dir'] = [os.path.join(dataset_config['dataset_root'], dd) for dd in dataset_config['data_dir']]

Expand All @@ -49,11 +49,11 @@ def build_dataset(
else:
dataset_config['label_file'] = [os.path.join(dataset_config['dataset_root'], lf) for lf in dataset_config['label_file']]

# get dataset class
# build datasets
dataset_class_name = dataset_config.pop('type')
assert dataset_class_name in supported_dataset_types, "Invalid dataset name"
dataset_class = eval(dataset_class_name)

#print('dataset config', dataset_config)

dataset_args = dict(is_train=is_train, **dataset_config)
dataset = dataset_class(**dataset_args)

Expand Down
7 changes: 4 additions & 3 deletions mindocr/models/heads/rec_ctc_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ def __init__(self,
self.out_channels = out_channels
self.mid_channels = mid_channels
self.return_feats = return_feats

if weight_init == "crnn_customised":
weight_init = crnn_head_initialization(in_channels)

if bias_init == "crnn_customised":
bias_init = crnn_head_initialization(in_channels)

Expand Down Expand Up @@ -75,7 +75,8 @@ def construct(self, x):
h = self.dense2(h)

if not self.training:
h = ops.softmax(h, axis=2)
#h = ops.softmax(h, axis=2) # not support on ms 1.8.1
h = ops.Softmax(axis=2)(h)

pred = {'head_out': h}
return pred
Expand Down
2 changes: 2 additions & 0 deletions tools/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ def main(cfg):
print(f'INFO: datasets found: {os.listdir(dataset_root)} \n'
f'INFO: dataset_root is changed to {dataset_root}'
)
# update dataset root dir to cache
assert 'dataset_root' in config['train']['dataset'], f'`dataset_root` must be provided in the yaml file for training on ModelArts or OpenI, but not found in {yaml_fp}. Please add `dataset_root` to `train:dataset` and `eval:dataset` in the yaml file'
config.train.dataset.dataset_root = dataset_root
config.eval.dataset.dataset_root = dataset_root

Expand Down