Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CodeCamp #116 Add SROIE to dataset preparer #1639

Merged
Merged
15 changes: 15 additions & 0 deletions configs/textdet/_base_/datasets/sroie.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Root directory of the prepared SROIE dataset; the ann_file paths below
# are resolved relative to this directory.
sroie_textdet_data_root = 'data/sroie'
gaotongxiao marked this conversation as resolved.
Show resolved Hide resolved

# SROIE text detection: training split.
sroie_textdet_train = {
    'type': 'OCRDataset',
    'data_root': sroie_textdet_data_root,
    'ann_file': 'textdet_train.json',
    'filter_cfg': {'filter_empty_gt': True, 'min_size': 32},
    'pipeline': None,
}

# SROIE text detection: test split (test_mode enabled).
sroie_textdet_test = {
    'type': 'OCRDataset',
    'data_root': sroie_textdet_data_root,
    'ann_file': 'textdet_test.json',
    'test_mode': True,
    'pipeline': None,
}
14 changes: 14 additions & 0 deletions configs/textrecog/_base_/datasets/sroie.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Root directory of the prepared SROIE dataset; the ann_file paths below
# are resolved relative to this directory.
sroie_textrecog_data_root = 'data/sroie'
gaotongxiao marked this conversation as resolved.
Show resolved Hide resolved

# SROIE text recognition: training split.
sroie_textrecog_train = {
    'type': 'OCRDataset',
    'data_root': sroie_textrecog_data_root,
    'ann_file': 'textrecog_train.json',
    'pipeline': None,
}

# SROIE text recognition: test split (test_mode enabled).
sroie_textrecog_test = {
    'type': 'OCRDataset',
    'data_root': sroie_textrecog_data_root,
    'ann_file': 'textrecog_test.json',
    'test_mode': True,
    'pipeline': None,
}
31 changes: 31 additions & 0 deletions dataset_zoo/sroie/metafile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Name: 'Scanned Receipts OCR and Information Extraction'
Paper:
  Title: ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction
  URL: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8977955
  Venue: ICDAR
  Year: '2019'
  BibTeX: '@INPROCEEDINGS{8977955,
    author={Huang, Zheng and Chen, Kai and He, Jianhua and Bai, Xiang and Karatzas, Dimosthenis and Lu, Shijian and Jawahar, C. V.},
    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
    title={ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction},
    year={2019},
    volume={},
    number={},
    pages={1516-1520},
    doi={10.1109/ICDAR.2019.00244}}'
Data:
  Website: https://rrc.cvc.uab.es/?ch=13
  Language:
    - English
  Scene:
    - Document
  Granularity:
    - Word
  Tasks:
    - textdet
    - textrecog
    - textspotting
  License:
    Type: CC BY 4.0
    Link: https://creativecommons.org/licenses/by/4.0/
  Format: .txt
9 changes: 9 additions & 0 deletions dataset_zoo/sroie/sample_anno.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
**Text Detection**

```text
# x1,y1,x2,y2,x3,y3,x4,y4,trans

72,25,326,25,326,64,72,64,TAN WOON YANN
50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
205,121,285,121,285,139,205,139,789417-W
```
76 changes: 76 additions & 0 deletions dataset_zoo/sroie/textdet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Directory the prepared SROIE dataset is written to.
data_root = 'data/sroie'
# Directory used to cache the downloaded archives before extraction.
cache_path = 'data/cache'

# Downloads the three SROIE archives (train images+annotations, test
# images, test annotations), checks their md5 sums, and moves the
# extracted files into the layout consumed by data_converter below.
# NOTE(review): these googleusercontent.com export URLs embed per-session
# tokens (user/nonce/hash query parameters) and presumably expire --
# confirm they are still downloadable or replace them with stable links.
data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        # Training split: one archive holding both .jpg images and the
        # matching .txt annotation files.
        dict(
            url='https://doc-b4-c8-drive-data-export.googleusercontent.com'
            '/download/fjpfehrefh7sl37sr71jeh2bhss0bjnc/frvcn2hirmtpmhokmdp'
            'dfb5f9mn3vmu2/1671610500000/ab5eb9ff-0939-4733-b13e-b91ccd5ca3'
            'e9/103212024722882466354/ADt3v-PlZRXqj94_z156zbyAJvCgP_INPDXqz'
            'Uqez3jcQBO0TCy1AXXkuWPgRLCbyQC5x7cisBfEtqGdAHlGKo9yJnvAcv12GsA'
            'HS2ppinZhj2_kFlcQqmCIUogMe2iKzsSZ3RMSbgTMIfIMSlDcZ-UTI5OfXCLR3'
            'JhZ31ZCxYBYEmrqaMSrO_L55dUzNw-pgCxeE56qpoLYWQo-oGmpKfX4fLdbJkN'
            'xcC49uXCHHdHifHjpTCFn0B6-XCVdp9uX0fweojHWNiyD_Z7WRzIOmrBF0MM3t'
            'mlCN8O4Txu8gskuUPqvc66WzRaIFXcfEdYq_QLe4gD-8fr8L6q4-u26TsDAly2'
            'f9937mA==?nonce=ou9fflkqd9ie0&user=103212024722882466354&authu'
            'ser=0&hash=ert0codc6560intrkm24htb1ven4cvau',
            save_name='0325updated.task1train(626p)-20221205T094142Z-001.zip',
            md5='3883fa9ef6ade95a8d3b9076813ad848',
            split=['train'],
            content=['image', 'annotation'],
            # Wildcard mappings: images and txts are split into separate
            # destination directories.
            mapping=[['0325updated/0325updated.task1train(626p)/*.jpg',
                      'textdet_imgs/train'],
                     ['0325updated/0325updated.task1train(626p)/*.txt',
                      'annotations/train']]),
        # Test split: images only.
        dict(
            url='https://doc-0o-2o-drive-data-export.googleusercontent.com/'
            'download/fjpfehrefh7sl37sr71jeh2bhss0bjnc/16u970fcc7dd54arckv'
            'rpjbd6ecdkull/1671538500000/b3194449-f672-4f77-8021-1a9650df6a'
            'a4/103212024722882466354/ADt3v-MXYOvylQYausMHix7WhseI1d0YLyqt'
            'B2r8n1QPTJebve8oJFf4_8APh0L7r-HyE0CyjhxRXf8bM909oFE2sWKEulqEnv'
            'ussVQdvFh73dlC7goMbGSb1-EfUWR4wXpHpsoPYVOjw1grQiExl0v_P3LHaD9Er'
            'TkZkA3ZhR-q9iKUq_7i7eyXbdhT35l34Xnal7mFap2P2ZdacoBLzD2LDI1GRqXt'
            'lnMKWm4KnTZuBGBNXFxJGjjnhy3x4j9meSt_eod0vZzyfAgs5ThHM1kk6dc8pT'
            'KAM84p0z2cp-N0GJgi8pLMZR8nPdxKBJNipOzp4Y_8tUGO?authuser=0&nonc'
            'e=vkamrsalrga2q&user=103212024722882466354&hash=8trg3mjsqhc8hf'
            '6uhim8jqirsjdp0r2l',
            save_name='task1&2_test(361p)-20221205T104647Z-001.zip',
            md5='1b05af23e4b38ca27d19cfe95272418f',
            split=['test'],
            content=['image'],
            mapping=[['task1&2_test(361p)-20221205T104647Z-001/'
                      'task1_2_test(361p)', 'textdet_imgs/test']]),
        # Test split: annotations only.
        dict(
            url='https://doc-4c-2o-drive-data-export.googleusercontent.com/'
            'download/fjpfehrefh7sl37sr71jeh2bhss0bjnc/cc96iun157a4pj0p3v284'
            'ra800a04jff/1671610500000/8c3588ab-67fe-4d4d-8a0c-d105e4cbcfd3/'
            '103212024722882466354/ADt3v-Ovr_nucmMNIjxpVbCHhn2p5_N7rtbldYPUt'
            'HF_k1dtcWih0LnE1BSUuHNhlf7MF4mcABQFhpaEIZX_GIkaDOUJ3IT8gSnMiz'
            '5aaK225clwIWyEBmZHqKC3e87Gz785sWQUSRLxucU3k2JCvyfI0uwnlozNcICY'
            'fRn2pPnCmGL4-iqm6MtsH9fzy5p_3nWpgw6TN5q14-2wT4CsIdMK_kPizJCzkV'
            'GkYkiRmYp3AfQso1kxGn5x1h1KsI9ofz9VnJJDmLITqILz7ax8_5IIsMVkgDkxg'
            'FJM8pUfdVPQw2Y1n96Hm3M0UGWM2m-ZVxQJExRLTDL-CCC1AWkLAY6EwWFmpnDt'
            'dg==?nonce=6aol4ohc1essk&user=103212024722882466354&authuser=0'
            '&hash=mci04vpisvduilokqck7la6as0qrd2ut',
            save_name='text.task1&2-test(361p)-20221205T112052Z-001.zip',
            md5='0bf94b74a5baadfe2bb0cd72889127a5',
            split=['test'],
            content=['annotation'],
            mapping=[['text/text.task1_2-test(361p)', 'annotations/test']]),
    ])

# Pairs every image with its annotation file, parses the polygons and
# transcriptions, and dumps the result as json for both splits.
data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test'],
    data_root=data_root,
    gatherer=dict(
        type='pair_gather',
        suffixes=['.jpg'],
        # Image names match ``X<digits>`` with a case-insensitive .jpg
        # suffix; the paired annotation is the same stem plus ``.txt``.
        rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']),
    # utf-8-sig tolerates a leading BOM in the annotation files.
    parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'),
    dumper=dict(type='JsonDumper'),
    # Intermediate extraction folders to remove once conversion is done.
    delete=['text', 'task1&2_test(361p)-20221205T104647Z-001', '0325updated',
            'annotations'])

# Emits the dataset config for text detection based on data_root.
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
77 changes: 77 additions & 0 deletions dataset_zoo/sroie/textrecog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# The SROIE recognition preparer downloads and gathers exactly the same
# archives as the detection preparer (the obtainer/gatherer/parser configs
# were byte-for-byte duplicates), so inherit textdet.py and only override
# what differs: the converter (crops text instances for recognition) and
# the config generator. This mirrors how textspotting.py builds on
# textdet.py.
# NOTE(review): relies on mmengine-style ``_base_`` inheritance with dict
# merging -- the overrides below merge with the base data_converter /
# config_generator fields (splits, data_root, gatherer, parser, dumper,
# delete); confirm the merged result matches the previous standalone file.
_base_ = ['textdet.py']

# Crop per-instance text patches instead of emitting detection polygons.
data_converter = dict(type='TextRecogCropConverter')

# Emits the dataset config for text recognition.
config_generator = dict(type='TextRecogConfigGenerator')
3 changes: 3 additions & 0 deletions dataset_zoo/sroie/textspotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Build on the detection preparer: identical download/gather/parse steps,
# with only the converter swapped to produce text spotting annotations.
_base_ = ['textdet.py']

data_converter = {'type': 'TextSpottingDataConverter'}
gaotongxiao marked this conversation as resolved.
Show resolved Hide resolved
11 changes: 7 additions & 4 deletions mmocr/datasets/preparers/data_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,13 @@ def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
"""
files = list()
for file in list_files(img_path, suffixes):
file2 = re.sub(rule[0], rule[1], osp.basename(file))
file2 = file.replace(osp.basename(file), file2)
file2 = file2.replace(self.img_dir, 'annotations')
files.append((file, file2))
if not re.match(rule[0], osp.basename(file)):
continue
else:
file2 = re.sub(rule[0], rule[1], osp.basename(file))
file2 = file.replace(osp.basename(file), file2)
file2 = file2.replace(self.img_dir, 'annotations')
files.append((file, file2))
FerryHuang marked this conversation as resolved.
Show resolved Hide resolved

return files

Expand Down
9 changes: 9 additions & 0 deletions mmocr/datasets/preparers/data_obtainer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import glob
import shutil
import ssl
import urllib.request as request
Expand Down Expand Up @@ -148,6 +149,14 @@ def move(self, mapping: List[Tuple[str, str]]) -> None:
for src, dst in mapping:
src = osp.join(self.data_root, src)
dst = osp.join(self.data_root, dst)

if '*' in src.split('/')[-1] and not osp.exists(dst):
os.makedirs(dst)
for f in glob.glob(src):
if osp.exists(f):
shutil.move(f, dst)
continue

if osp.exists(src) and not osp.exists(dst):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These lines can be optimized to be more general. The if condition is too strict and may only be applicable to this dataset.

  1. Wildcard (*) does not usually appear in the file path, and therefore we can use glob to match all the files whenever there is * in the file path.
  2. mmengine has a util mkdir_or_exist to create the directory whenever needed.
  3. Is line 156 really necessary? I think glob.glob should return a list of existing files.
  4. Using continue to skip the next if block doesn't make the logic flow very clear. I'm in favor of if...elif... here

To conclude, the code can be rewritten as:

import mmengine

#...

if '*' in src:
    mmengine.mkdir_or_exist(dst)
    for f in glob.glob(src):
        shutil.move(f, dst)
elif osp.exists(src) and not osp.exists(dst):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your suggestions and I have modified this code block. But with #1639 (comment) fixed, this reports error like 'path already exists' when running task-recog after task-det. Keeping the not osp.exists(dst) in if statement will be fine. So it's like this

if '*' in src and not osp.exists(dst):
    mkdir_or_exist(dst)
    for f in glob.glob(src):
        shutil.move(f, dst)

elif osp.exists(src) and not osp.exists(dst):

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem to address the key issue. Such an error occurs because shutil will check the existence of the destination file. If dst is just a directory path, shutil will do the check on the resulting filename, which is equivalent to osp.exists(osp.join(dst, osp.basename(src)).

Back to your implementation: ... and not osp.exists(dst) only verifies the existence of the directory instead of a specific file. Suppose that we have placed an empty directory to dst, this implementation will still skip moving all the files inside.

Therefore, it's better to just check the existence of each file with osp.exists(osp.join(dst, osp.basename(f)) and only skip those existing ones.

shutil.move(src, dst)

Expand Down
3 changes: 2 additions & 1 deletion mmocr/datasets/preparers/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from .svt_parser import SVTTextDetAnnParser
from .totaltext_parser import TotaltextTextDetAnnParser
from .wildreceipt_parser import WildreceiptKIEAnnParser
from .sroie_parser import SROIETextDetAnnParser

__all__ = [
'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
'COCOTextDetAnnParser', 'SVTTextDetAnnParser'
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'SROIETextDetAnnParser'
]
72 changes: 72 additions & 0 deletions mmocr/datasets/preparers/parsers/sroie_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

from mmocr.utils import bbox2poly
from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class SROIETextDetAnnParser(BaseParser):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the difference with ICDARTextDetAnnParaser

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only difference is the try-except structure added to handle unicode errors that I want to skip. Should I merge this into the ICDARTextDetAnnParser?

"""ICDAR Txt Format Text Detection Annotation Parser.
gaotongxiao marked this conversation as resolved.
Show resolved Hide resolved

The original annotation format of this dataset is stored in txt files,
which is formed as the following format:
x1, y1, x2, y2, x3, y3, x4, y4, transcription

Args:
separator (str): The separator between each element in a line. Defaults
to ','.
ignore (str): The text to be ignored. Defaults to '###'.
format (str): The format of the annotation. Defaults to
'x1,y1,x2,y2,x3,y3,x4,trans'.
encoding (str): The encoding of the annotation file. Defaults to
'utf-8-sig'.
nproc (int): The number of processes to parse the annotation. Defaults
to 1.
remove_strs (List[str], Optional): Used to remove redundant strings in
the transcription. Defaults to None.
mode (str, optional): The mode of the box converter. Supported modes
are 'xywh' and 'xyxy'. Defaults to None.
"""

def __init__(self,
             separator: str = ',',
             ignore: str = '###',
             # NOTE: parameter name shadows the ``format`` builtin, but it
             # is part of the public interface and cannot be renamed.
             format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
             encoding: str = 'utf-8-sig',
             nproc: int = 1,
             remove_strs: Optional[List[str]] = None,
             mode: Optional[str] = None) -> None:
    # Field separator used when splitting each annotation line.
    self.sep = separator
    self.format = format
    self.encoding = encoding
    # Transcriptions equal to this string are marked as ignored instances.
    self.ignore = ignore
    # Box-to-polygon conversion mode for ``bbox2poly``; None keeps the
    # 8-value polygon from the file as-is.
    self.mode = mode
    self.remove_strs = remove_strs
    super().__init__(nproc=nproc)

def parse_file(self, file: Tuple, split: str) -> Tuple:
    """Parse a single SROIE annotation file into detection instances.

    Args:
        file (Tuple): ``(image_path, annotation_path)`` pair produced by
            the gatherer.
        split (str): The dataset split being processed (unused here, kept
            for the parser interface).

    Returns:
        Tuple: ``(image_path, instances)`` where ``instances`` is a list
        of dicts with ``poly`` (list of floats), ``text`` (str) and
        ``ignore`` (bool) keys.
    """
    img_file, txt_file = file
    instances = []
    try:
        for anno in self.loader(txt_file, self.sep, self.format,
                                self.encoding):
            fields = list(anno.values())
            # Strip configured redundant substrings from every field.
            # (str.replace is already a no-op when the substring is
            # absent, so no membership check is needed.)
            if self.remove_strs is not None:
                for target in self.remove_strs:
                    fields = [field.replace(target, '') for field in fields]
            # All fields except the last are polygon coordinates.
            poly = [float(value) for value in fields[:-1]]
            if self.mode is not None:
                # Interpret the values as a bbox and convert to a polygon.
                poly = bbox2poly(poly, self.mode).tolist()
            text = fields[-1]
            instances.append(
                dict(poly=poly, text=text, ignore=text == self.ignore))
    except Exception:
        # NOTE(review): deliberately best-effort -- per the PR discussion,
        # some SROIE annotation files contain undecodable characters and
        # are silently skipped here (instances gathered before the failure
        # are kept). Narrowing this to UnicodeDecodeError/ValueError would
        # stop it from hiding genuine bugs; confirm which errors occur.
        pass

    return img_file, instances