Skip to content

Commit

Permalink
Update ctw1500 to the new data-prepare design
Browse files Browse the repository at this point in the history
  • Loading branch information
Harold-lkk committed Mar 6, 2023
1 parent 9e964f6 commit 00349d1
Show file tree
Hide file tree
Showing 10 changed files with 205 additions and 109 deletions.
65 changes: 0 additions & 65 deletions dataset_zoo/ctw/textdet.py

This file was deleted.

9 changes: 0 additions & 9 deletions dataset_zoo/ctw/textrecog.py

This file was deleted.

9 changes: 0 additions & 9 deletions dataset_zoo/ctw/textspotting.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Name: 'CTW'
Name: 'SCUT-CTW1500'
Paper:
Title: Curved scene text detection via transverse and longitudinal sequence connection
URL: https://www.sciencedirect.com/science/article/pii/S0031320319300664
Expand Down
76 changes: 76 additions & 0 deletions dataset_zoo/scut-ctw1500/textdet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Root directory where the prepared dataset will live, and the shared
# download cache used by the obtainer.
data_root = 'data/ctw1500'
cache_path = 'data/cache'

# Preparer pipeline for the SCUT-CTW1500 *train* split (text detection):
# obtain (download/verify/extract) -> gather (pair images with annotation
# files) -> parse -> pack -> dump.
train_preparer = dict(
    # Downloads each archive into `cache_path`, checks its md5, and extracts
    # it; `mapping` entries are [path-inside-archive, destination] pairs
    # (destinations presumably relative to `data_root` — confirm against
    # NaiveDataObtainer).
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url='https://universityofadelaide.box.com/shared/static/'
                'py5uwlfyyytbb2pxzq9czvu6fuqbjdh8.zip',
                save_name='ctw1500_train_images.zip',
                md5='f1453464b764343040644464d5c0c4fa',
                split=['train'],
                content=['image'],
                mapping=[[
                    'ctw1500_train_images/train_images', 'textdet_imgs/train'
                ]]),
            dict(
                url='https://universityofadelaide.box.com/shared/static/'
                'jikuazluzyj4lq6umzei7m2ppmt3afyw.zip',
                save_name='ctw1500_train_labels.zip',
                md5='d9ba721b25be95c2d78aeb54f812a5b1',
                split=['train'],
                content=['annotation'],
                mapping=[[
                    'ctw1500_train_labels/ctw1500_train_labels/',
                    'annotations/train'
                ]])
        ]),
    # Pairs 'NNNN.jpg' images with 'NNNN.xml' annotations via the regex
    # back-reference in `rule` (train annotations are XML).
    gatherer=dict(
        type='PairGatherer',
        img_suffixes=['.jpg', '.JPG'],
        rule=[r'(\d{4}).jpg', r'\1.xml']),
    parser=dict(type='CTW1500AnnParser'),  # raw annotation -> instances
    packer=dict(type='TextDetPacker'),  # instances -> detection format
    dumper=dict(type='JsonDumper'),  # writes packed annotations as JSON
)

# Preparer pipeline for the SCUT-CTW1500 *test* split. Mirrors
# `train_preparer`, except the test annotations are plain-text files named
# with a '000' prefix (see `rule` below).
test_preparer = dict(
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url='https://universityofadelaide.box.com/shared/static/'
                't4w48ofnqkdw7jyc4t11nsukoeqk9c3d.zip',
                save_name='ctw1500_test_images.zip',
                md5='79103fd77dfdd2c70ae6feb3a2fb4530',
                split=['test'],
                content=['image'],
                mapping=[[
                    'ctw1500_test_images/test_images', 'textdet_imgs/test'
                ]]),
            dict(
                url='https://cloudstor.aarnet.edu.au/plus/s/uoeFl0pCN9BOCN5/'
                'download',
                save_name='ctw1500_test_labels.zip',
                md5='7f650933a30cf1bcdbb7874e4962a52b',
                split=['test'],
                content=['annotation'],
                mapping=[['ctw1500_test_labels', 'annotations/test']])
        ]),
    # Pairs 'NNNN.jpg' images with '000NNNN.txt' annotation files.
    gatherer=dict(
        type='PairGatherer',
        img_suffixes=['.jpg', '.JPG'],
        rule=[r'(\d{4}).jpg', r'000\1.txt']),
    parser=dict(type='CTW1500AnnParser'),
    packer=dict(type='TextDetPacker'),
    dumper=dict(type='JsonDumper'),
)
# Intermediate directories to clean up after preparation — names match the
# extracted archive folders above (presumably removed by the data preparer
# once packing is done; confirm against the preparer implementation).
delete = [
    'ctw1500_train_images', 'ctw1500_test_images', 'annotations',
    'ctw1500_train_labels', 'ctw1500_test_labels'
]
# Emits a ready-to-use text-detection dataset config for this dataset.
config_generator = dict(type='TextDetConfigGenerator')
9 changes: 9 additions & 0 deletions dataset_zoo/scut-ctw1500/textrecog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Text-recognition preparer for SCUT-CTW1500, derived from the detection
# config. NOTE: this is an MMEngine-style config — the `_base_` attribute
# accesses below are resolved by the config loader, not by plain Python.
_base_ = ['textdet.py']

# Reuse the images already extracted by the detection preparer instead of
# downloading them again.
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'

# Crop each text instance out of the full image, as recognition trains on
# per-instance crops rather than whole images.
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.test_preparer.packer.type = 'TextRecogCropPacker'

config_generator = dict(type='TextRecogConfigGenerator')
9 changes: 9 additions & 0 deletions dataset_zoo/scut-ctw1500/textspotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Text-spotting (end-to-end detection + recognition) preparer for
# SCUT-CTW1500, derived from the detection config. NOTE: MMEngine-style
# config — `_base_` attribute access is resolved by the config loader.
_base_ = ['textdet.py']

# Reuse the images already extracted by the detection preparer.
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'

# Pack both polygons and transcriptions, as spotting needs both.
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'

config_generator = dict(type='TextSpottingConfigGenerator')
4 changes: 2 additions & 2 deletions mmocr/datasets/preparers/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base import BaseParser
from .coco_parser import COCOTextDetAnnParser
from .ctw_parser import CTWAnnParser
from .ctw1500_parser import CTW1500AnnParser
from .funsd_parser import FUNSDTextDetAnnParser
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
ICDARTxtTextRecogAnnParser)
Expand All @@ -15,5 +15,5 @@
'BaseParser', 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
'SROIETextDetAnnParser', 'NAFAnnParser', 'CTWAnnParser'
'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser'
]
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,52 @@


@DATA_PARSERS.register_module()
class CTWAnnParser(BaseParser):
"""SCUT-CTW dataset parser.
class CTW1500AnnParser(BaseParser):
"""SCUT-CTW1500 dataset parser.
Args:
data_root (str): Path to the dataset root.
ignore (list(str)): The text of the ignored instances. Default: ['#'].
nproc (int): Number of processes to load the data. Default: 1.
ignore (str): The text of the ignored instances. Defaults to
'###'.
"""

def __init__(self,
data_root: str,
ignore: List[str] = ['#'],
nproc: int = 1) -> None:
def __init__(self, ignore: str = '###', **kwargs) -> None:
self.ignore = ignore
super().__init__(data_root=data_root, nproc=nproc)
super().__init__(**kwargs)

def parse_file(self, file: Tuple, split: str) -> Tuple:
"""Parse single annotation.
def parse_file(self, img_path: str, ann_path: str) -> Tuple:
"""Convert annotation for a single image.
Args:
file (tuple): Tuple of (img_file, json_file).
split (str): Split of the file. For train split, xml file will be
used. For test split, txt file will be used.
img_path (str): The path of image.
ann_path (str): The path of annotation.
Returns:
tuple: Tuple of (img_file, instances).
Tuple: A tuple of (img_path, instance).
- img_path (str): The path of image file, which can be read
directly by opencv.
- instance: instance is a list of dict containing parsed
annotations, which should contain the following keys:
- 'poly' or 'box' (textdet or textspotting)
- 'text' (textspotting or textrecog)
- 'ignore' (all task)
Examples:
An example of returned values:
>>> ('imgs/train/xxx.jpg',
>>> dict(
>>> poly=[[[0, 1], [1, 1], [1, 0], [0, 0]]],
>>> text='hello',
>>> ignore=False)
>>> )
"""
img_dir, anno_dir = file
if split == 'train':
instances = self.load_xml_info(anno_dir)
elif split == 'test':
instances = self.load_txt_info(anno_dir)
return img_dir, instances

if self.split == 'train':
instances = self.load_xml_info(ann_path)
elif self.split == 'test':
instances = self.load_txt_info(ann_path)
return img_path, instances

def load_txt_info(self, anno_dir: str) -> List:
"""Load the annotation of the SCUT-CTW dataset (test split).
Expand All @@ -64,7 +77,7 @@ def load_txt_info(self, anno_dir: str) -> List:
poly = np.array(xy).reshape(-1).tolist()
text = strs[28][4:]
instances.append(
dict(poly=poly, text=text, ignore=text in self.ignore))
dict(poly=poly, text=text, ignore=text == self.ignore))
return instances

def load_xml_info(self, anno_dir: str) -> List:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest

from mmocr.datasets.preparers.parsers import CTW1500AnnParser
from mmocr.utils import list_to_file


class TestCTW1500AnnParser(unittest.TestCase):
    """Unit tests for ``CTW1500AnnParser`` using fake train (XML) and test
    (txt) annotation fixtures."""

    def setUp(self) -> None:
        # Scratch directory holding the fixture files; removed in tearDown.
        self.root = tempfile.TemporaryDirectory()

    def _create_dummy_ctw1500_det(self):
        """Write one fake train (XML) and one fake test (txt) annotation.

        Returns:
            tuple: (dummy image path, train xml path, test txt path). The
            image path does not need to exist — the parser only reads the
            annotation file.
        """
        # Train-split format: XML with one <box> per instance holding a
        # <label> and a <segs> polygon (28 comma-separated coordinates).
        fake_train_anno = [
            '<Annotations>',
            ' <image file="0200.jpg">',
            ' <box height="197" left="131" top="49" width="399">',
            ' <label>OLATHE</label>',
            ' <segs>131,58,208,49,279,56,346,76,412,101,473,141,530,192,510,246,458,210,405,175,350,151,291,137,228,133,165,134</segs>',  # noqa: E501
            ' <pts x="183" y="95" />',
            ' <pts x="251" y="89" />',
            ' <pts x="322" y="107" />',
            ' <pts x="383" y="124" />',
            ' <pts x="441" y="161" />',
            ' <pts x="493" y="201" />',
            ' </box>',
            ' </image>',
            '</Annotations>',
        ]
        train_ann_file = osp.join(self.root.name, 'ctw1500_train.xml')
        list_to_file(train_ann_file, fake_train_anno)

        # Test-split format: 28 polygon coordinates followed by
        # '####<text>'; a transcription of '###' marks an ignored instance
        # (so '#######' -> text '###' -> ignored).
        fake_test_anno = [
            '48,84,61,79,75,73,88,68,102,74,116,79,130,84,135,73,119,67,104,60,89,56,74,61,59,67,45,73,#######',  # noqa: E501
            '51,137,58,137,66,137,74,137,82,137,90,137,98,137,98,119,90,119,82,119,74,119,66,119,58,119,50,119,####E-313',  # noqa: E501
            '41,155,49,155,57,155,65,155,73,155,81,155,89,155,87,136,79,136,71,136,64,136,56,136,48,136,41,137,#######',  # noqa: E501
            '41,193,57,193,74,194,90,194,107,195,123,195,140,196,146,168,128,167,110,167,92,167,74,166,56,166,39,166,####F.D.N.Y.',  # noqa: E501
        ]
        test_ann_file = osp.join(self.root.name, 'ctw1500_test.txt')
        list_to_file(test_ann_file, fake_test_anno)
        return (osp.join(self.root.name,
                         'ctw1500.jpg'), train_ann_file, test_ann_file)

    def test_textdet_parsers(self):
        # Train split: the parser reads the XML file and should yield the
        # single instance with its label and flattened polygon.
        parser = CTW1500AnnParser(split='train')
        img_path, train_file, test_file = self._create_dummy_ctw1500_det()
        img_path, instances = parser.parse_file(img_path, train_file)
        self.assertEqual(img_path, osp.join(self.root.name, 'ctw1500.jpg'))
        self.assertEqual(len(instances), 1)
        self.assertEqual(instances[0]['text'], 'OLATHE')
        self.assertEqual(instances[0]['poly'], [
            131, 58, 208, 49, 279, 56, 346, 76, 412, 101, 473, 141, 530, 192,
            510, 246, 458, 210, 405, 175, 350, 151, 291, 137, 228, 133, 165,
            134
        ])
        self.assertEqual(instances[0]['ignore'], False)

        # Test split: the txt file yields four instances; the '###'
        # transcription in line 1 must be flagged as ignored.
        parser = CTW1500AnnParser(split='test')
        img_path, instances = parser.parse_file(img_path, test_file)
        self.assertEqual(img_path, osp.join(self.root.name, 'ctw1500.jpg'))
        self.assertEqual(len(instances), 4)
        self.assertEqual(instances[0]['ignore'], True)
        self.assertEqual(instances[1]['text'], 'E-313')
        self.assertEqual(instances[3]['poly'], [
            41, 193, 57, 193, 74, 194, 90, 194, 107, 195, 123, 195, 140, 196,
            146, 168, 128, 167, 110, 167, 92, 167, 74, 166, 56, 166, 39, 166
        ])

    def tearDown(self) -> None:
        # Remove the fixture directory created in setUp.
        self.root.cleanup()

0 comments on commit 00349d1

Please sign in to comment.