Skip to content

use default ser ckpt in vi_layoutxlm_ser #789

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions mindocr/models/kie_layoutxlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ def _cfg(url="", **kwargs):


default_cfgs = {
"layoutxlm": _cfg(
url="https://download.mindspore.cn/toolkits/mindocr/layoutxlm/ser_layoutxlm_base-a4ea148e.ckpt"
),
"vi_layoutxlm": _cfg(
url="https://download.mindspore.cn/toolkits/mindocr/vi-layoutxlm/ser_vi_layoutxlm-f3c83585.ckpt"
),
Expand Down Expand Up @@ -65,19 +68,25 @@ def layoutxlm_ser(
}
model = KieNet(model_config)
if pretrained:
default_cfg = default_cfgs["vi_layoutxlm"]
default_cfg = default_cfgs["layoutxlm"]
load_pretrained(model, default_cfg)

return model


@register_model
def vi_layoutxlm_ser(pretrained: bool = True, use_visual_backbone: bool = False, use_float16: bool = False, **kwargs):
def vi_layoutxlm_ser(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

训练脚本是不是也要适配?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

不影响训练脚本,训练配置的是backbone的pretrained参数,这里并没有改动backbone网络的参数。

pretrained: bool = True,
pretrained_backbone: bool = False,
use_visual_backbone: bool = False,
use_float16: bool = False,
**kwargs
):
model_config = {
"type": "kie",
"backbone": {
"name": "layoutxlm",
"pretrained": pretrained, # backbone pretrained
"pretrained": pretrained_backbone, # backbone pretrained
"use_visual_backbone": use_visual_backbone,
"use_float16": use_float16,
},
Expand All @@ -90,5 +99,8 @@ def vi_layoutxlm_ser(pretrained: bool = True, use_visual_backbone: bool = False,
},
}
model = KieNet(model_config)
if pretrained:
default_cfg = default_cfgs["vi_layoutxlm"]
load_pretrained(model, default_cfg)

return model
15 changes: 12 additions & 3 deletions tools/infer/text/predict_ser.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __init__(self, args):
# build model for algorithm with pretrained weights or local checkpoint
ckpt_dir = args.ser_model_dir
if ckpt_dir is None:
pretrained = False
pretrained = True
ckpt_load_path = None
pretrained_backbone = False
else:
Expand All @@ -64,7 +64,10 @@ def __init__(self, args):
)
model_name = algo_to_model_name[args.ser_algorithm]
self.model = build_model(
model_name, pretrained=pretrained, pretrained_backbone=pretrained_backbone, ckpt_load_path=ckpt_load_path
model_name,
pretrained=pretrained,
pretrained_backbone=pretrained_backbone,
ckpt_load_path=ckpt_load_path,
)
self.model.set_train(False)

Expand Down Expand Up @@ -202,6 +205,7 @@ def run_batchwise(self, ocr_info_list):
token_type_ids_batch = []
segment_offset_ids_batch = []
ocr_infos_batch = []
image_batch = []
for j in range(batch_begin, batch_end): # image index j
data = self.preprocess(ocr_info_list[j])
input_ids_batch.append(data["input_ids"])
Expand All @@ -210,6 +214,7 @@ def run_batchwise(self, ocr_info_list):
token_type_ids_batch.append(data["token_type_ids"])
segment_offset_ids_batch.append(data["segment_offset_id"])
ocr_infos_batch.append(data["ocr_info"])
image_batch.append(data["image"])

input_ids_batch = (
np.stack(input_ids_batch) if len(input_ids_batch) > 1 else np.expand_dims(input_ids_batch[0], axis=0)
Expand All @@ -225,13 +230,15 @@ def run_batchwise(self, ocr_info_list):
if len(token_type_ids_batch) > 1
else np.expand_dims(token_type_ids_batch[0], axis=0)
)
image_batch = np.stack(image_batch) if len(image_batch) > 1 else np.expand_dims(image_batch[0], axis=0)

# infer
input_x = [
Tensor(input_ids_batch),
Tensor(bbox_batch),
Tensor(attention_mask_batch),
Tensor(token_type_ids_batch),
Tensor(image_batch),
]
logits = self.model(input_x)
# postprocess
Expand Down Expand Up @@ -262,13 +269,15 @@ def run_single(self, ocr_info_list):
token_type_ids = data["token_type_ids"]
segment_offset_id = data["segment_offset_id"]
ocr_info = data["ocr_info"]
image = data["image"]

input_ids = np.expand_dims(input_ids, axis=0)
bbox = np.expand_dims(bbox, axis=0)
attention_mask = np.expand_dims(attention_mask, axis=0)
token_type_ids = np.expand_dims(token_type_ids, axis=0)
image = np.expand_dims(image, axis=0)

input_x = (Tensor(input_ids), Tensor(bbox), Tensor(attention_mask), Tensor(token_type_ids))
input_x = (Tensor(input_ids), Tensor(bbox), Tensor(attention_mask), Tensor(token_type_ids), Tensor(image))

logits = self.model(input_x)

Expand Down
Loading