Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

CodeCamp #116 Add SROIE to dataset preparer #1639

Merged
Merged
31 changes: 31 additions & 0 deletions dataset_zoo/sroie/metafile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Dataset metadata for SROIE: ICDAR 2019 Robust Reading Challenge on
# Scanned Receipts OCR and Information Extraction (challenge 13).
Name: 'Scanned Receipts OCR and Information Extraction'
Paper:
  Title: ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction
  URL: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8977955
  Venue: ICDAR
  Year: '2019'
  BibTeX: '@INPROCEEDINGS{8977955,
    author={Huang, Zheng and Chen, Kai and He, Jianhua and Bai, Xiang and Karatzas, Dimosthenis and Lu, Shijian and Jawahar, C. V.},
    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
    title={ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction},
    year={2019},
    volume={},
    number={},
    pages={1516-1520},
    doi={10.1109/ICDAR.2019.00244}}'
Data:
  Website: https://rrc.cvc.uab.es/?ch=13
  Language:
    - English
  Scene:
    - Document
  Granularity:
    - Word
  # The dataset preparer supports all three tasks from the same annotations.
  Tasks:
    - textdet
    - textrecog
    - textspotting
  License:
    Type: CC BY 4.0
    Link: https://creativecommons.org/licenses/by/4.0/
  Format: .txt
9 changes: 9 additions & 0 deletions dataset_zoo/sroie/sample_anno.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
**Text Detection, Text Recognition and Text Spotting**

```text
# x1,y1,x2,y2,x3,y3,x4,y4,trans

72,25,326,25,326,64,72,64,TAN WOON YANN
50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
205,121,285,121,285,139,205,139,789417-W
```
76 changes: 76 additions & 0 deletions dataset_zoo/sroie/textdet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Dataset preparer config for SROIE (ICDAR 2019) text detection.
# Declares where to download the raw archives, how to lay them out on disk,
# how to convert the annotations to MMOCR format, and which base configs
# to generate afterwards.

data_root = 'data/sroie'
cache_path = 'data/cache'

data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        dict(
            # NOTE(review): this is a signed Google Drive export URL
            # (embeds a nonce/hash/user token); such links typically
            # expire — confirm it is still downloadable or replace it
            # with a stable mirror.
            url='https://doc-b4-c8-drive-data-export.googleusercontent.com'
            '/download/fjpfehrefh7sl37sr71jeh2bhss0bjnc/frvcn2hirmtpmhokmdp'
            'dfb5f9mn3vmu2/1671610500000/ab5eb9ff-0939-4733-b13e-b91ccd5ca3'
            'e9/103212024722882466354/ADt3v-PlZRXqj94_z156zbyAJvCgP_INPDXqz'
            'Uqez3jcQBO0TCy1AXXkuWPgRLCbyQC5x7cisBfEtqGdAHlGKo9yJnvAcv12GsA'
            'HS2ppinZhj2_kFlcQqmCIUogMe2iKzsSZ3RMSbgTMIfIMSlDcZ-UTI5OfXCLR3'
            'JhZ31ZCxYBYEmrqaMSrO_L55dUzNw-pgCxeE56qpoLYWQo-oGmpKfX4fLdbJkN'
            'xcC49uXCHHdHifHjpTCFn0B6-XCVdp9uX0fweojHWNiyD_Z7WRzIOmrBF0MM3t'
            'mlCN8O4Txu8gskuUPqvc66WzRaIFXcfEdYq_QLe4gD-8fr8L6q4-u26TsDAly2'
            'f9937mA==?nonce=ou9fflkqd9ie0&user=103212024722882466354&authu'
            'ser=0&hash=ert0codc6560intrkm24htb1ven4cvau',
            save_name='0325updated.task1train(626p)-20221205T094142Z-001.zip',
            md5='3883fa9ef6ade95a8d3b9076813ad848',
            split=['train'],
            # Training archive ships images and annotations together;
            # glob patterns fan them out into separate directories.
            content=['image', 'annotation'],
            mapping=[
                [
                    '0325updated/0325updated.task1train(626p)/*.jpg',
                    'textdet_imgs/train'
                ],
                [
                    '0325updated/0325updated.task1train(626p)/*.txt',
                    'annotations/train'
                ]
            ]),
        dict(
            # NOTE(review): signed Google Drive export URL — see note above
            # about link expiry.
            url='https://doc-0o-2o-drive-data-export.googleusercontent.com/'
            'download/fjpfehrefh7sl37sr71jeh2bhss0bjnc/16u970fcc7dd54arckv'
            'rpjbd6ecdkull/1671538500000/b3194449-f672-4f77-8021-1a9650df6a'
            'a4/103212024722882466354/ADt3v-MXYOvylQYausMHix7WhseI1d0YLyqt'
            'B2r8n1QPTJebve8oJFf4_8APh0L7r-HyE0CyjhxRXf8bM909oFE2sWKEulqEnv'
            'ussVQdvFh73dlC7goMbGSb1-EfUWR4wXpHpsoPYVOjw1grQiExl0v_P3LHaD9Er'
            'TkZkA3ZhR-q9iKUq_7i7eyXbdhT35l34Xnal7mFap2P2ZdacoBLzD2LDI1GRqXt'
            'lnMKWm4KnTZuBGBNXFxJGjjnhy3x4j9meSt_eod0vZzyfAgs5ThHM1kk6dc8pT'
            'KAM84p0z2cp-N0GJgi8pLMZR8nPdxKBJNipOzp4Y_8tUGO?authuser=0&nonc'
            'e=vkamrsalrga2q&user=103212024722882466354&hash=8trg3mjsqhc8hf'
            '6uhim8jqirsjdp0r2l',
            save_name='task1&2_test(361p)-20221205T104647Z-001.zip',
            md5='1b05af23e4b38ca27d19cfe95272418f',
            split=['test'],
            # Test images come in a separate archive from test annotations.
            content=['image'],
            mapping=[
                [
                    'task1&2_test(361p)-20221205T104647Z-001/task1_2_test(361p)',
                    'textdet_imgs/test'
                ]
            ]),
        dict(
            # NOTE(review): signed Google Drive export URL — see note above
            # about link expiry.
            url='https://doc-4c-2o-drive-data-export.googleusercontent.com/'
            'download/fjpfehrefh7sl37sr71jeh2bhss0bjnc/cc96iun157a4pj0p3v284'
            'ra800a04jff/1671610500000/8c3588ab-67fe-4d4d-8a0c-d105e4cbcfd3/'
            '103212024722882466354/ADt3v-Ovr_nucmMNIjxpVbCHhn2p5_N7rtbldYPUt'
            'HF_k1dtcWih0LnE1BSUuHNhlf7MF4mcABQFhpaEIZX_GIkaDOUJ3IT8gSnMiz'
            '5aaK225clwIWyEBmZHqKC3e87Gz785sWQUSRLxucU3k2JCvyfI0uwnlozNcICY'
            'fRn2pPnCmGL4-iqm6MtsH9fzy5p_3nWpgw6TN5q14-2wT4CsIdMK_kPizJCzkV'
            'GkYkiRmYp3AfQso1kxGn5x1h1KsI9ofz9VnJJDmLITqILz7ax8_5IIsMVkgDkxg'
            'FJM8pUfdVPQw2Y1n96Hm3M0UGWM2m-ZVxQJExRLTDL-CCC1AWkLAY6EwWFmpnDt'
            'dg==?nonce=6aol4ohc1essk&user=103212024722882466354&authuser=0'
            '&hash=mci04vpisvduilokqck7la6as0qrd2ut',
            save_name='text.task1&2-test(361p)-20221205T112052Z-001.zip',
            md5='0bf94b74a5baadfe2bb0cd72889127a5',
            split=['test'],
            content=['annotation'],
            mapping=[['text/text.task1_2-test(361p)', 'annotations/test']]),
    ])

data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test'],
    data_root=data_root,
    gatherer=dict(
        type='pair_gather',
        suffixes=['.jpg'],
        # Pair each image named like "X<digits>.jpg" (case-insensitive
        # extension) with its annotation file "X<digits>.txt".
        rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']),
    # 'utf-8-sig' tolerates a UTF-8 BOM at the start of annotation files.
    parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'),
    dumper=dict(type='JsonDumper'),
    # Intermediate extraction directories removed after conversion.
    delete=[
        'text', 'task1&2_test(361p)-20221205T104647Z-001', '0325updated',
        'annotations'
    ])

config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
5 changes: 5 additions & 0 deletions dataset_zoo/sroie/textrecog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
_base_ = ['textdet.py']

data_converter = dict(type='TextRecogCropConverter')

config_generator = dict(type='TextRecogConfigGenerator')
5 changes: 5 additions & 0 deletions dataset_zoo/sroie/textspotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
_base_ = ['textdet.py']

data_converter = dict(type='TextSpottingDataConverter')
gaotongxiao marked this conversation as resolved.
Show resolved Hide resolved

config_generator = dict(type='TextSpottingConfigGenerator')
2 changes: 2 additions & 0 deletions mmocr/datasets/preparers/data_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
"""
files = list()
for file in list_files(img_path, suffixes):
if not re.match(rule[0], osp.basename(file)):
continue
file2 = re.sub(rule[0], rule[1], osp.basename(file))
file2 = file.replace(osp.basename(file), file2)
file2 = file2.replace(self.img_dir, 'annotations')
Expand Down
10 changes: 9 additions & 1 deletion mmocr/datasets/preparers/data_obtainer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import glob
import shutil
import ssl
import urllib.request as request
def move(self, mapping: List[Tuple[str, str]]) -> None:
    """Relocate downloaded files according to ``(src, dst)`` pairs.

    Both paths are resolved relative to ``self.data_root``. A ``src``
    containing a glob wildcard moves every matching file into the ``dst``
    directory (created if needed), skipping names already present there.
    A plain ``src`` is moved only when it exists and ``dst`` does not.
    """
    for source, target in mapping:
        source = osp.join(self.data_root, source)
        target = osp.join(self.data_root, target)

        if '*' in source:
            # Wildcard source: fan all matches out into the target dir.
            mkdir_or_exist(target)
            for matched in glob.glob(source):
                already_there = osp.exists(
                    osp.join(target, osp.basename(matched)))
                if not already_there:
                    shutil.move(matched, target)
        elif osp.exists(source) and not osp.exists(target):
            shutil.move(source, target)

def clean(self) -> None:
Expand Down
3 changes: 2 additions & 1 deletion mmocr/datasets/preparers/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from .svt_parser import SVTTextDetAnnParser
from .totaltext_parser import TotaltextTextDetAnnParser
from .wildreceipt_parser import WildreceiptKIEAnnParser
from .sroie_parser import SROIETextDetAnnParser

__all__ = [
'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
'COCOTextDetAnnParser', 'SVTTextDetAnnParser'
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'SROIETextDetAnnParser'
]
74 changes: 74 additions & 0 deletions mmocr/datasets/preparers/parsers/sroie_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

from mmocr.utils import bbox2poly
from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class SROIETextDetAnnParser(BaseParser):
    """SROIE Txt Format Text Detection Annotation Parser.

    The original annotation format of this dataset is stored in txt files,
    which is formed as the following format:
    x1, y1, x2, y2, x3, y3, x4, y4, transcription

    Args:
        separator (str): The separator between each element in a line.
            Defaults to ','.
        ignore (str): The text to be ignored. Defaults to '###'.
        format (str): The format of the annotation. Defaults to
            'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation.
            Defaults to 1.
        remove_strs (List[str], Optional): Used to remove redundant strings
            in the transcription. Defaults to None.
        mode (str, optional): The mode of the box converter. Supported modes
            are 'xywh' and 'xyxy'. Defaults to None.
    """

    def __init__(self,
                 separator: str = ',',
                 ignore: str = '###',
                 format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
                 encoding: str = 'utf-8-sig',
                 nproc: int = 1,
                 remove_strs: Optional[List[str]] = None,
                 mode: Optional[str] = None) -> None:
        self.sep = separator
        self.format = format
        self.encoding = encoding
        self.ignore = ignore
        self.mode = mode
        self.remove_strs = remove_strs
        super().__init__(nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Parse a single (image, annotation) file pair.

        Args:
            file (Tuple): ``(img_file, txt_file)`` paths.
            split (str): The dataset split the file belongs to (unused
                here; part of the BaseParser interface).

        Returns:
            Tuple: ``(img_file, instances)`` where every instance is a dict
            with ``poly``, ``text`` and ``ignore`` keys.
        """
        img_file, txt_file = file
        instances = list()
        try:
            # Some SROIE annotation files contain illegal symbols that the
            # loader cannot decode; the whole file is skipped in that case.
            # This broad catch is a deliberate best-effort choice so one
            # malformed file does not abort the whole preparation run.
            for anno in self.loader(txt_file, self.sep, self.format,
                                    self.encoding):
                anno = list(anno.values())
                if self.remove_strs is not None:
                    # Strip configured redundant substrings from every field.
                    for strs in self.remove_strs:
                        for i in range(len(anno)):
                            if strs in anno[i]:
                                anno[i] = anno[i].replace(strs, '')
                # All fields except the last are polygon coordinates.
                poly = list(map(float, anno[0:-1]))
                if self.mode is not None:
                    # bbox2poly returns an array; convert back to a list.
                    poly = bbox2poly(poly, self.mode)
                    poly = poly.tolist()
                text = anno[-1]
                instances.append(
                    dict(poly=poly, text=text, ignore=text == self.ignore))
        except Exception:
            pass

        return img_file, instances