mindspore-lab · SamitHuang · Mar 3, 2023 · Feb 7, 2023 · Feb 7, 2023 · Feb 7, 2023
diff --git a/configs/rec/r34_bilstm_ctc.yaml b/configs/rec/r34_bilstm_ctc.yaml
@@ -0,0 +1,142 @@
+system:
+  mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
+  distribute: False 
+  amp_level: 'O3'
+  seed: 42
+  val_while_train: True
+  ckpt_save_dir: './tmp_rec'
+
+common:
+  character_dict_path: &character_dict_path  #mindocr/utils/dict/en_dict.txt
+  num_classes: &num_classes 37 # num_chars_in_dict+1,  TODO: retreive it from dict or check correctness
+  max_text_len: &max_text_len 24
+  infer_mode: &infer_mode False
+  use_space_char: &use_space_char False
+  batch_size: &batch_size 64
+
+model:
+  type: rec
+  transform: null
+  backbone:
+    name: rec_resnet34
+    pretrained: False
+  neck:
+    name: RNNEncoder
+    hidden_size: 256 
+  head:
+    name: CTCHead 
+    weight_init: crnn_customised
+    bias_init: crnn_customised
+    out_channels: *num_classes 
+
+postprocess:
+  name: RecCTCLabelDecode
+  character_dict_path: *character_dict_path
+  use_space_char: *use_space_char
+
+metric:
+  name: RecMetric
+  main_indicator: acc
+  character_dict_path: *character_dict_path
+  ignore_space: True 
+
+loss:
+  name: CTCLoss 
+  pred_seq_len: 25 # TODO: retrieve from the network output shape.
+  max_label_len: *max_text_len  # this value should be smaller than pre_seq_len
+  batch_size: *batch_size
+
+scheduler: 
+  scheduler: warmup_cosine_decay
+  min_lr: 0.0
+  lr: 0.0005
+  num_epochs: 40
+  warmup_epochs: 1
+  # warmup_steps: 500
+  decay_epochs: 39
+
+optimizer:
+  opt: adamw
+  filter_bias_and_bn: True
+  momentum: 0.95
+  weight_decay: 0.0001
+  loss_scale: 512
+  nesterov: False
+  #use_nesterov: True 
+
+train:
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    data_dir: /old/katekong/crnn/datasets/ocr-datasets/data_lmdb_release/training/MJ/MJ_train/
+    # label_files: /data/ocr_datasets/ic15/word_recognition/rec_gt_train.txt
+    sample_ratios: [1.0]
+    shuffle: True
+    transform_pipeline:
+      - DecodeImage: 
+          img_mode: BGR
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len 
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize. 
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true.
+      - NormalizeImage:  # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec. 
+          bgr_to_rgb: True
+          is_hwc: True
+          mean : [127.0, 127.0, 127.0] 
+          std : [127.0, 127.0, 127.0]
+      - ToCHWImage: 
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visaulize 
+    output_keys: ['image', 'text_seq'] #, 'length'] #'img_path'] 
+    num_keys_to_net: 1 # num inputs for network forward func in output_keys
+    #keys_for_loss: 4 # num labels for loss func 
+
+  loader:
+      shuffle: True # TODO: tbc
+      batch_size: *batch_size
+      drop_remainder: True
+      max_rowsize: 12
+      num_workers: 8
+
+eval:
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    data_dir: /old/katekong/crnn/datasets/ocr-datasets/validation/
+    # label_files: /data/ocr_datasets/ic15/word_recognition/rec_gt_train.txt
+    sample_ratios: [1.0]
+    shuffle: False 
+    transform_pipeline:
+      - DecodeImage: 
+          img_mode: BGR
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len 
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize. 
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true.
+      - NormalizeImage:  # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec. 
+          bgr_to_rgb: True
+          is_hwc: True
+          mean : [127.0, 127.0, 127.0] 
+          std : [127.0, 127.0, 127.0]
+      - ToCHWImage: 
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visaulize 
+    output_keys: ['image', 'text_padded', 'text_length']  # TODO return text string padding w/ fixed length, and a scaler to indicate the length 
+    num_keys_to_net: 1 # num inputs for network forward func
+
+  loader:
+      shuffle: False # TODO: tbc
+      batch_size: 64
+      drop_remainder: True
+      max_rowsize: 12
+      num_workers: 8
diff --git a/configs/rec/vgg7_bilstm_ctc.yaml b/configs/rec/vgg7_bilstm_ctc.yaml
@@ -0,0 +1,141 @@
+system:
+  mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
+  distribute: False 
+  amp_level: 'O3'
+  seed: 42
+  val_while_train: True
+  ckpt_save_dir: './tmp_rec'
+
+common:
+  character_dict_path: &character_dict_path  #mindocr/utils/dict/en_dict.txt
+  num_classes: &num_classes 37 # num_chars_in_dict+1,  TODO: retreive it from dict or check correctness
+  max_text_len: &max_text_len 23
+  infer_mode: &infer_mode False
+  use_space_char: &use_space_char False
+  batch_size: &batch_size 16
+
+model:
+  type: rec
+  transform: null
+  backbone:
+    name: rec_vgg7
+    pretrained: False
+  neck:
+    name: RNNEncoder
+    hidden_size: 256 
+  head:
+    name: CTCHead 
+    weight_init: crnn_customised
+    bias_init: crnn_customised
+    out_channels: *num_classes 
+
+postprocess:
+  name: RecCTCLabelDecode
+  character_dict_path: *character_dict_path
+  use_space_char: *use_space_char
+
+metric:
+  name: RecMetric
+  main_indicator: acc
+  character_dict_path: *character_dict_path
+  ignore_space: True 
+
+loss:
+  name: CTCLoss 
+  pred_seq_len: 24 # TODO: retrieve from the network output shape.
+  max_label_len: *max_text_len  # this value should be smaller than pre_seq_len
+  batch_size: *batch_size
+
+scheduler: 
+  scheduler: warmup_cosine_decay
+  min_lr: 0.0
+  lr: 0.0005
+  num_epochs: 10
+  warmup_epochs: 0
+  decay_epochs: 10
+
+optimizer:
+  opt: adamw
+  filter_bias_and_bn: True
+  momentum: 0.95
+  weight_decay: 0.0001
+  loss_scale: 1024
+  nesterov: False
+  #use_nesterov: True 
+
+train:
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    data_dir: /old/katekong/crnn/datasets/ocr-datasets/data_lmdb_release/training/MJ/MJ_train/
+    # label_files: /data/ocr_datasets/ic15/word_recognition/rec_gt_train.txt
+    sample_ratios: [1.0]
+    shuffle: True
+    transform_pipeline:
+      - DecodeImage: 
+          img_mode: BGR
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len 
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize. 
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true.
+      - NormalizeImage:  # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec. 
+          bgr_to_rgb: True
+          is_hwc: True
+          mean : [127.0, 127.0, 127.0] 
+          std : [127.0, 127.0, 127.0]
+      - ToCHWImage: 
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visaulize 
+    output_keys: ['image', 'text_seq'] #, 'length'] #'img_path'] 
+    num_keys_to_net: 1 # num inputs for network forward func in output_keys
+    #keys_for_loss: 4 # num labels for loss func 
+
+  loader:
+      shuffle: True # TODO: tbc
+      batch_size: *batch_size
+      drop_remainder: True
+      max_rowsize: 12
+      num_workers: 8
+
+eval:
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    data_dir: /old/katekong/crnn/datasets/ocr-datasets/validation/
+    # label_files: /data/ocr_datasets/ic15/word_recognition/rec_gt_train.txt
+    sample_ratios: [1.0]
+    shuffle: False 
+    transform_pipeline:
+      - DecodeImage: 
+          img_mode: BGR
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len 
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+      - RecResizeImg: # different from paddle (paddle converts image from HWC to CHW and rescale to [-1, 1] after resize. 
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true.
+      - NormalizeImage:  # different from paddle (paddle wrongly normalize BGR image with RGB mean/std from ImageNet for det, and simple rescale to [-1, 1] in rec. 
+          bgr_to_rgb: True
+          is_hwc: True
+          mean : [127.0, 127.0, 127.0] 
+          std : [127.0, 127.0, 127.0]
+      - ToCHWImage: 
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visaulize 
+    output_keys: ['image', 'text_padded', 'text_length']  # TODO return text string padding w/ fixed length, and a scaler to indicate the length 
+    num_keys_to_net: 1 # num inputs for network forward func
+
+  loader:
+      shuffle: False # TODO: tbc
+      batch_size: 64
+      drop_remainder: True
+      max_rowsize: 12
+      num_workers: 8
diff --git a/mindocr/data/builder.py b/mindocr/data/builder.py
@@ -3,8 +3,9 @@
 from addict import Dict
 from .det_dataset import DetDataset
 from .rec_dataset import RecDataset
+from .rec_lmdb_dataset import LMDBDataset
 
-support_dataset_types = ['BaseDataset', 'DetDataset', 'RecDataset']
+support_dataset_types = ['BaseDataset', 'DetDataset', 'RecDataset', 'LMDBDataset']
 
 def build_dataset(dataset_config: dict,
                     loader_config: dict,