mindspore-lab · SamitHuang · Feb 24, 2023 · Feb 16, 2023 · Feb 17, 2023 · Feb 17, 2023
diff --git a/.gitignore b/.gitignore
@@ -3,8 +3,8 @@ __pycache__/
 *.py[cod]
 *$py.class
 
-.swp
-.swo
+*.swp
+*.swo
 
 # C extensions
 *.so

diff --git a/configs/det/db_r50_icdar15.yaml b/configs/det/db_r50_icdar15.yaml
@@ -1,87 +1,112 @@
+system:
+  mode: 0  # MindSpore execution mode: 0 for graph mode, 1 for pynative mode
+  distribute: False  # NOTE(review): presumably toggles distributed training; confirm against the training script
+  amp_level: 'O0'  # NOTE(review): looks like the mixed-precision level; confirm what 'O0' selects
+  seed: 42  # NOTE(review): presumably the global random seed; confirm where it is applied
+
 model:
   type: det
   transform: null
   backbone:
     name: det_resnet50
     pretrained: False
   neck:
-    name: FPN
+    name: DBFPN
     out_channels: 256
-    #use_asf: True
+    bias: False
+    use_asf: False # enable it for DB++ 
   head:
-    name: ConvHead 
-    out_channels: 2
+    name: DBHead 
     k: 50
+    bias: False
+    adaptive: True
+    serial: False
+
+loss:
+  name: L1BalanceCELoss  
+  eps: 0.000001 
+  l1_scale: 10
+  bce_scale: 5 
+  bce_replace: bceloss
 
 scheduler: 
-  name: "cosine_decay"
-  min_lr: 0.0
-  lr: 0.1
-  warmup_epochs: 4
-  decay_epochs: 96
+  scheduler: "cosine_decay"
+  min_lr: 0.00001
+  lr: 0.001
+  num_epochs: 300
+  warmup_epochs: 0
+  decay_epochs: 280  # NOTE(review): warmup_epochs + decay_epochs = 280 < num_epochs = 300; confirm how the LR behaves for the remaining 20 epochs
 
 optimizer:
-  name: "momentum"
+  opt: "momentum"
   filter_bias_and_bn: True
   momentum: 0.9
   weight_decay: 0.0001
-  loss_scale: 1024
+  loss_scale: 1.0  # NOTE(review): lowered from 1024; a scale of 1.0 presumably disables loss scaling, in line with amp_level 'O0' above - confirm
   use_nesterov: False
 
 train:
+  dataset_sink_mode: False
   dataset:
     type: DetDataset
     data_dir: /data/ocr_datasets/ic15/text_localization/train
     label_files: /data/ocr_datasets/ic15/text_localization/train/train_icdar15_label.txt
+    #data_dir: /Users/Samit/Data/datasets/ic15/det/train
+    #label_files: /Users/Samit/Data/datasets/ic15/det/train/train_icdar2015_label.txt
     sample_ratios: [1.0]
-    shuffle: True,
+    shuffle: True
     transform_pipeline:
-              - DecodeImage: 
-                  img_mode: BGR, 
-                  to_float32: False
-              - DetLabelEncode:  
-              - MZResizeByGrid: 
-                  #denominator: 32
-                  divisor: 32
-                  transform_polys: True # originally in modelzoo, it doesn't transform polys
-              - MZRandomScaleByShortSide: 
-                  short_side: 736
-              - IaaAugment: 
-                  augmenter_args:
-                    - {'type': 'Affine', 'args': {'rotate': [-10, 10]}}
-                    - {'type': 'Fliplr', 'args': {'p': 0.5}}
-              - MZRandomCropData: 
-                  max_tries: 100 
-                  min_crop_side_ratio: 0.1
-                  crop_size: [640, 640]
-              - MZResizeByGrid: 
-                  #denominator: 32
-                  divisor: 32
-                  transform_polys: True
-              #- MakeShrinkMap: 
-              - MZMakeSegDetectionData: 
-                  min_text_size: 8
-                  shrink_ratio: 0.4
-              #- 'MakeBorderMap': 
-              - MZMakeBorderMap:
-                  shrink_ratio: 0.4
-                  thresh_min: 0.3
-                  thresh_max: 0.7
-              - MZRandomColorAdjust: 
-                  brightness: 32.0 / 255
-                  saturation: 0.5
-                  to_numpy: True
-              #{'MZIrregularNormToCHW': None},
-              - NormalizeImage: 
-                  bgr_to_rgb: True
-                  is_hwc: True
-                  mean : [123.675, 116.28, 103.53]
-                  std : [58.395, 57.12, 57.375]
-              - ToCHWImage: 
-    output_keys: ['img_path', 'image']
+      - DecodeImage: 
+          img_mode: BGR
+          to_float32: False
+      - DetLabelEncode:  
+      - MZResizeByGrid: 
+          #denominator: 32
+          divisor: 32
+          transform_polys: True # originally in modelzoo, it doesn't transform polys
+      - MZRandomScaleByShortSide: 
+          short_side: 736
+      - IaaAugment: 
+          augmenter_args:
+            - {'type': 'Affine', 'args': {'rotate': [-10, 10]}}
+            - {'type': 'Fliplr', 'args': {'p': 0.5}}
+      - MZRandomCropData: 
+          max_tries: 100 
+          min_crop_side_ratio: 0.1
+          crop_size: [640, 640]
+      - MZResizeByGrid: 
+          #denominator: 32
+          divisor: 32
+          transform_polys: True
+      #- MakeShrinkMap: 
+      - MZMakeSegDetectionData: 
+          min_text_size: 8
+          shrink_ratio: 0.4
+      #- 'MakeBorderMap': 
+      - MZMakeBorderMap:
+          shrink_ratio: 0.4
+          thresh_min: 0.3
+          thresh_max: 0.7
+      - MZRandomColorAdjust: 
+          brightness: 0.1255  # ~= 32.0 / 255, written as a literal because YAML does not evaluate arithmetic (the expression would load as a string)
+          saturation: 0.5
+          to_numpy: True
+      #{'MZIrregularNormToCHW': None},
+      - NormalizeImage: 
+          bgr_to_rgb: True
+          is_hwc: True
+          mean : [123.675, 116.28, 103.53]
+          std : [58.395, 57.12, 57.375]
+      - ToCHWImage: 
+    # Order of the data loader's output list; it matches the network input(s) and the label inputs of the loss function, plus optional data for debugging/visualization.
+    output_keys: ['image', 'shrink_map', 'shrink_mask', 'threshold_map', 'threshold_mask']  # 'img_path' is currently left out
+    num_keys_to_net: 1  # number of inputs for the network forward function
+    #keys_for_loss: 4 # num labels for loss func
+
   loader:
       shuffle: True # TODO: tbc
-      batch_size: 8
+      batch_size: 16
       drop_remainder: True
       max_rowsize: 6
-      num_workers: 4
+      num_workers: 2
+
diff --git a/configs/det/db_r50_icdar15.yaml.bakup b/configs/det/db_r50_icdar15.yaml.bakup
@@ -0,0 +1,32 @@
+
+
+
+model:
+  type: det
+  transform: null
+  backbone:
+    name: det_resnet50
+    pretrained: False
+  neck:
+    name: FPN
+    out_channels: 256
+    #use_asf: True
+  head:
+    name: ConvHead 
+    out_channels: 2
+    k: 50
+
+scheduler: 
+  name: "cosine_decay"
+  min_lr: 0.0
+  lr: 0.1
+  warmup_epochs: 4
+  decay_epochs: 96
+
+optimizer:
+  name: "momentum"
+  filter_bias_and_bn: True
+  momentum: 0.9
+  weight_decay: 0.0001
+  loss_scale: 1024
+  use_nesterov: False
diff --git a/configs/rec/crnn_icdar15.yaml b/configs/rec/crnn_icdar15.yaml
@@ -0,0 +1,89 @@
+system:
+  mode: 1  # MindSpore execution mode: 0 for graph mode, 1 for pynative mode
+  distribute: False  # NOTE(review): presumably toggles distributed training; confirm against the training script
+  amp_level: 'O0'  # NOTE(review): looks like the mixed-precision level; confirm what 'O0' selects
+  seed: 42  # NOTE(review): presumably the global random seed; confirm where it is applied
+
+common:  # values anchored here are reused by alias (*name) in the sections below
+  character_dict_path: &character_dict_path null  # explicit null instead of a bare empty value; set a path such as mindocr/utils/dict/en_dict.txt to use a dictionary file
+  num_classes: &num_classes 37  # num_chars_in_dict + 1; TODO: retrieve it from the dict or check correctness
+  max_text_len: &max_text_len 23
+  infer_mode: &infer_mode False
+  use_space_char: &use_space_char False
+  batch_size: &batch_size 32
+
+model:
+  type: rec
+  transform: null
+  backbone:
+    name: rec_vgg7 #resnet34@mindcv
+    pretrained: False
+  neck:
+    name: RNNEncoder
+    hidden_size: 256 
+  head:
+    name: CTCHead 
+    out_channels: *num_classes 
+
+loss:
+  name: CTCLoss 
+  pred_seq_len: 24  # TODO: this should be retrieved from the network output shape.
+  max_label_len: *max_text_len  # this value should be smaller than pred_seq_len (23 < 24 with the values in this file)
+  batch_size: *batch_size
+
+scheduler: 
+  scheduler: "cosine_decay"
+  min_lr: 0.000001
+  lr: 0.001
+  num_epochs: 100
+  warmup_epochs: 10
+  decay_epochs: 90
+
+optimizer:
+  opt: "momentum"
+  filter_bias_and_bn: True
+  momentum: 0.9
+  weight_decay: 0.0001
+  loss_scale: 1.0
+  use_nesterov: True 
+
+train:
+  dataset_sink_mode: False
+  dataset:
+    type: RecDataset
+    #data_dir: /Users/Samit/Data/datasets/ic15/rec/ch4_training_word_images_gt
+    #label_files: /Users/Samit/Data/datasets/ic15/rec/rec_gt_train.txt
+    data_dir: /data/ocr_datasets/ic15/word_recognition/ch4_training_word_images_gt
+    label_files: /data/ocr_datasets/ic15/word_recognition/rec_gt_train.txt
+    sample_ratios: [1.0]
+    shuffle: True
+    transform_pipeline:
+      - DecodeImage: 
+          img_mode: BGR
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len 
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+      - RecResizeImg:  # differs from Paddle, which converts the image from HWC to CHW and rescales it to [-1, 1] after the resize
+          image_shape: [32, 100]  # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True  # aspect ratio will be preserved if true.
+      - NormalizeImage:  # differs from Paddle, which wrongly normalizes BGR images with the ImageNet RGB mean/std for det and simply rescales to [-1, 1] for rec
+          bgr_to_rgb: True
+          is_hwc: True
+          mean : [127.0, 127.0, 127.0] 
+          std : [127.0, 127.0, 127.0]
+      - ToCHWImage: 
+    # Order of the data loader's output list; it matches the network input(s) and the label inputs of the loss function, plus optional data for debugging/visualization.
+    output_keys: ['image', 'text_seq']  # 'length' and 'img_path' are currently left out
+    num_keys_to_net: 1  # number of inputs for the network forward function
+    #keys_for_loss: 4 # num labels for loss func
+
+  loader:
+      shuffle: True # TODO: tbc
+      batch_size: *batch_size
+      drop_remainder: True
+      max_rowsize: 6
+      num_workers: 2
diff --git a/mindocr/data/.DS_Store → mindocr/.DS_Store b/mindocr/data/.DS_Store → mindocr/.DS_Store
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,8 +3,8 @@ __pycache__/ @@
     *.py[cod]
     *$py.class
-    .swp
-    .swo
+    *.swp
+    *.swo
     # C extensions
     *.so
@@ Expand Down @@