Skip to content

Refactor recognition data pipeline #430

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jun 27, 2023
90 changes: 42 additions & 48 deletions configs/rec/crnn/crnn_resnet34.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
system:
mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
distribute: True
amp_level: 'O3'
amp_level: "O3"
seed: 42
log_interval: 100
val_while_train: True
drop_overflow_update: False
ckpt_max_keep: 5

common:
character_dict_path: &character_dict_path #mindocr/utils/dict/en_dict.txt
num_classes: &num_classes 37 # num_chars_in_dict+1, TODO: retrieve it from dict or check correctness
character_dict_path: &character_dict_path
num_classes: &num_classes 37 # num_chars_in_dict + 1
max_text_len: &max_text_len 24
infer_mode: &infer_mode False
use_space_char: &use_space_char False
Expand Down Expand Up @@ -44,8 +45,8 @@ metric:

loss:
name: CTCLoss
pred_seq_len: 25 # TODO: retrieve from the network output shape.
max_label_len: *max_text_len # this value should be smaller than pred_seq_len
pred_seq_len: 25
max_label_len: *max_text_len # this value should be smaller than pred_seq_len
batch_size: *batch_size

scheduler:
Expand All @@ -68,48 +69,45 @@ loss_scaler:
loss_scale: 512

train:
ckpt_save_dir: './tmp_rec'
ckpt_save_dir: "./tmp_rec"
pred_cast_fp32: False # let CTCLoss cast internally
dataset_sink_mode: False
dataset:
type: LMDBDataset
dataset_root: path/to/data_lmdb_release/ # Optional, if set, dataset_root will be used as a prefix for data_dir
data_dir: training/
# label_file: # not required when using LMDBDataset
label_file: null # not required when using LMDBDataset
sample_ratio: 1.0
shuffle: True
transform_pipeline:
- DecodeImage:
img_mode: BGR
to_float32: False
- Decode:
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
use_space_char: *use_space_char
lower: True
- RecResizeImg: # different from paddle (paddle converts the image from HWC to CHW and rescales it to [-1, 1] after resize).
- RecResizeImg:
image_shape: [32, 100] # H, W
infer_mode: *infer_mode
character_dict_path: *character_dict_path
padding: False # aspect ratio will be preserved if true.
- NormalizeImage: # different from paddle (paddle wrongly normalizes BGR images with RGB mean/std from ImageNet for det, and simply rescales to [-1, 1] in rec).
bgr_to_rgb: True
is_hwc: True
mean : [127.0, 127.0, 127.0]
std : [127.0, 127.0, 127.0]
- Normalize:
mean: [127.0, 127.0, 127.0]
std: [127.0, 127.0, 127.0]
- HWC2CHW:
# the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']
output_columns: ["image", "text_seq"]
net_input_column_index: [0] # input indices for network forward func in output_columns
label_column_index: [1] # input indices marked as label
#keys_for_loss: 4 # num labels for loss func

loader:
shuffle: True # TODO: tbc
batch_size: *batch_size
drop_remainder: True
max_rowsize: 12
num_workers: 8
shuffle: True
batch_size: *batch_size
drop_remainder: True
max_rowsize: 12
num_workers: 1
num_workers_dataset: 1
using_multiprocess_for_pipeline: False

eval:
ckpt_load_path: ./tmp_rec/best.ckpt
Expand All @@ -118,13 +116,11 @@ eval:
type: LMDBDataset
dataset_root: path/to/data_lmdb_release/
data_dir: validation/
# label_file: # not required when using LMDBDataset
label_file: null # not required when using LMDBDataset
sample_ratio: 1.0
shuffle: False
transform_pipeline:
- DecodeImage:
img_mode: RGB
to_float32: False
- Decode:
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
Expand All @@ -138,16 +134,17 @@ eval:
norm_before_pad: False
- HWC2CHW:
# the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
output_columns: ['image', 'text_padded', 'text_length'] # TODO return text string padding w/ fixed length, and a scalar to indicate the length
output_columns: ["image", "text_padded", "text_length"]
net_input_column_index: [0] # input indices for network forward func in output_columns
label_column_index: [1, 2] # input indices marked as label

loader:
shuffle: False # TODO: tbc
batch_size: 64
drop_remainder: False
max_rowsize: 12
num_workers: 8
shuffle: False
batch_size: 64
drop_remainder: False
max_rowsize: 12
num_workers: 4
num_workers_dataset: 1

predict:
ckpt_load_path: ./tmp_rec/best.ckpt
Expand All @@ -164,28 +161,25 @@ predict:
- DecodeImage:
img_mode: BGR
to_float32: False
# - RecCTCLabelEncode:
# max_text_len: *max_text_len
# character_dict_path: *character_dict_path
# use_space_char: *use_space_char
# lower: True
- RecResizeImg: # different from paddle (paddle converts the image from HWC to CHW and rescales it to [-1, 1] after resize).
- RecResizeImg:
image_shape: [32, 100] # H, W
infer_mode: *infer_mode
character_dict_path: *character_dict_path
padding: False # aspect ratio will be preserved if true.
- NormalizeImage: # different from paddle (paddle wrongly normalizes BGR images with RGB mean/std from ImageNet for det, and simply rescales to [-1, 1] in rec).
- NormalizeImage:
bgr_to_rgb: True
is_hwc: True
mean : [127.0, 127.0, 127.0]
std : [127.0, 127.0, 127.0]
mean: [127.0, 127.0, 127.0]
std: [127.0, 127.0, 127.0]
- ToCHWImage:
# the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
output_columns: [ 'img_path', 'image', 'raw_img_shape' ]
output_columns: ["img_path", "image", "raw_img_shape"]

loader:
shuffle: False # TODO: tbc
batch_size: 1
drop_remainder: True
max_rowsize: 12
num_workers: 8
shuffle: False
batch_size: 1
drop_remainder: True
max_rowsize: 12
num_workers: 1
num_workers_dataset: 1
using_multiprocess_for_pipeline: False
24 changes: 11 additions & 13 deletions configs/rec/crnn/crnn_resnet34_ch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,7 @@ train:
filter_max_len: True
max_text_len: *max_text_len
transform_pipeline:
- DecodeImage:
img_mode: BGR
to_float32: False
- Decode:
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
Expand All @@ -99,12 +97,10 @@ train:
infer_mode: *infer_mode
character_dict_path: *character_dict_path
padding: True
- NormalizeImage:
bgr_to_rgb: True
is_hwc: True
- Normalize:
mean: [127.0, 127.0, 127.0]
std: [127.0, 127.0, 127.0]
- ToCHWImage:
- HWC2CHW:
output_columns: ["image", "text_seq"]
net_input_column_index: [0]
label_column_index: [1]
Expand All @@ -113,8 +109,10 @@ train:
shuffle: True
batch_size: *batch_size
drop_remainder: True
max_rowsize: 12
max_rowsize: 24
num_workers: 1
num_workers_dataset: 1
using_multiprocess_for_pipeline: False

eval:
ckpt_load_path: ./tmp_rec/best.ckpt
Expand All @@ -127,9 +125,7 @@ eval:
sample_ratio: 1.0
shuffle: False
transform_pipeline:
- DecodeImage:
img_mode: RGB
to_float32: False
- Decode:
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
Expand All @@ -144,7 +140,7 @@ eval:
keep_ratio: True
padding: True
norm_before_pad: False
- ToCHWImage:
- HWC2CHW:
output_columns: ["image", "text_padded", "text_length"]
net_input_column_index: [0]
label_column_index: [1, 2]
Expand All @@ -153,5 +149,7 @@ eval:
shuffle: False
batch_size: 64
drop_remainder: False
max_rowsize: 12
max_rowsize: 24
num_workers: 1
num_workers_dataset: 1
using_multiprocess_for_pipeline: False
60 changes: 29 additions & 31 deletions configs/rec/crnn/crnn_vgg7.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
system:
mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
distribute: True
amp_level: 'O3'
amp_level: "O3"
seed: 42
log_interval: 100
val_while_train: True
drop_overflow_update: False
ckpt_max_keep: 5

common:
character_dict_path: &character_dict_path #mindocr/utils/dict/en_dict.txt
character_dict_path: &character_dict_path #mindocr/utils/dict/en_dict.txt

num_classes: &num_classes 37 # num_chars_in_dict+1, TODO: retrieve it from dict or check correctness
max_text_len: &max_text_len 23
infer_mode: &infer_mode False
Expand Down Expand Up @@ -45,7 +47,7 @@ metric:
loss:
name: CTCLoss
pred_seq_len: 24 # TODO: retrieve from the network output shape.
max_label_len: *max_text_len # this value should be smaller than pred_seq_len
max_label_len: *max_text_len # this value should be smaller than pred_seq_len
batch_size: *batch_size

scheduler:
Expand All @@ -62,14 +64,13 @@ optimizer:
momentum: 0.95
weight_decay: 0.0001
nesterov: False
#use_nesterov: True

loss_scaler:
type: static
loss_scale: 1024

train:
ckpt_save_dir: './tmp_rec'
ckpt_save_dir: "./tmp_rec"
pred_cast_fp32: False # let CTCLoss cast internally
dataset_sink_mode: False
dataset:
Expand All @@ -80,40 +81,37 @@ train:
sample_ratio: 1.0
shuffle: True
transform_pipeline:
- DecodeImage:
img_mode: BGR
to_float32: False
- Decode:
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
use_space_char: *use_space_char
lower: True
- RecResizeImg: # different from paddle (paddle converts the image from HWC to CHW and rescales it to [-1, 1] after resize).
- RecResizeImg:
image_shape: [32, 100] # H, W
infer_mode: *infer_mode
character_dict_path: *character_dict_path
padding: False # aspect ratio will be preserved if true.
- NormalizeImage: # different from paddle (paddle wrongly normalizes BGR images with RGB mean/std from ImageNet for det, and simply rescales to [-1, 1] in rec).
bgr_to_rgb: True
is_hwc: True
mean : [127.0, 127.0, 127.0]
std : [127.0, 127.0, 127.0]
- Normalize:
mean: [127.0, 127.0, 127.0]
std: [127.0, 127.0, 127.0]
- HWC2CHW:
# the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']
output_columns: ["image", "text_seq"]
net_input_column_index: [0] # input indices for network forward func in output_columns
label_column_index: [1] # input indices marked as label
#keys_for_loss: 4 # num labels for loss func

loader:
shuffle: True # TODO: tbc
batch_size: *batch_size
drop_remainder: True
max_rowsize: 12
num_workers: 8
shuffle: True
batch_size: *batch_size
drop_remainder: True
max_rowsize: 12
num_workers: 1
num_workers_dataset: 1
using_multiprocess_for_pipeline: False

eval:
ckpt_load_path: './tmp_rec/best.ckpt'
ckpt_load_path: "./tmp_rec/best.ckpt"
dataset_sink_mode: False
dataset:
type: LMDBDataset
Expand All @@ -123,9 +121,7 @@ eval:
sample_ratio: 1.0
shuffle: False
transform_pipeline:
- DecodeImage:
img_mode: RGB
to_float32: False
- Decode:
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
Expand All @@ -139,13 +135,15 @@ eval:
norm_before_pad: False
- HWC2CHW:
# the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
output_columns: ['image', 'text_padded', 'text_length'] # TODO return text string padding w/ fixed length, and a scalar to indicate the length
output_columns: ["image", "text_padded", "text_length"] # TODO return text string padding w/ fixed length, and a scalar to indicate the length
net_input_column_index: [0] # input indices for network forward func in output_columns
label_column_index: [1, 2] # input indices marked as label

loader:
shuffle: False # TODO: tbc
batch_size: 16
drop_remainder: False
max_rowsize: 12
num_workers: 8
shuffle: False
batch_size: 64
drop_remainder: False
max_rowsize: 12
num_workers: 1
num_workers_dataset: 1
using_multiprocess_for_pipeline: False
Loading