Skip to content

Commit

Permalink
Merge pull request #134 from breezedeus/dev
Browse files Browse the repository at this point in the history
bugfixes
  • Loading branch information
breezedeus authored Jul 18, 2024
2 parents c4271c7 + f8efdc6 commit 4438d9b
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 19 deletions.
14 changes: 14 additions & 0 deletions docs/RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Release Notes

# Update 2024.07.18: **V1.1.1.2** Released

Major Changes:

* Bug fixes:
* https://github.com/breezedeus/Pix2Text/issues/129
* https://github.com/breezedeus/Pix2Text/issues/116

主要变更:

* 修复 bugs:
* https://github.com/breezedeus/Pix2Text/issues/129
* https://github.com/breezedeus/Pix2Text/issues/116

# Update 2024.06.24: **V1.1.1.1** Released

Major Changes:
Expand Down
2 changes: 1 addition & 1 deletion pix2text/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# [Pix2Text](https://github.com/breezedeus/pix2text): an Open-Source Alternative to Mathpix.
# Copyright (C) 2022-2024, [Breezedeus](https://www.breezedeus.com).

__version__ = '1.1.1.1'
__version__ = '1.1.1.2'
15 changes: 6 additions & 9 deletions pix2text/doc_xl_layout/doc_xl_layout_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import shutil
from collections import defaultdict
from copy import deepcopy
from copy import deepcopy, copy
from pathlib import Path
import logging
from typing import Union, List, Dict, Any, Optional
Expand Down Expand Up @@ -110,16 +110,13 @@ def __init__(

@classmethod
def from_config(cls, configs: Optional[dict] = None, device: str = None, **kwargs):
configs = configs or {}
configs = copy(configs or {})
device = select_device(device)
# configs['device'] = device if device != 'mps' else 'cpu'
model_fp = configs.pop('model_fp', None)
root = configs.pop('root', data_dir())
configs.pop('device', None)

return cls(
device=device,
model_fp=configs.get('model_fp', None),
root=configs.get('root', data_dir()),
**configs,
)
return cls(device=device, model_fp=model_fp, root=root, **configs)

def _prepare_model_files(self, root, model_info):
model_root_dir = Path(root).expanduser() / MODEL_VERSION
Expand Down
8 changes: 7 additions & 1 deletion pix2text/pix_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
layout_parser: Optional[LayoutParser] = None,
text_formula_ocr: Optional[TextFormulaOCR] = None,
table_ocr: Optional[TableOCR] = None,
enable_formula: bool = True,
**kwargs,
):
"""
Expand All @@ -49,6 +50,7 @@ def __init__(
layout_parser (LayoutParser): The layout parser object; default value is `None`, which means to create a default one
text_formula_ocr (TextFormulaOCR): The text and formula OCR object; default value is `None`, which means to create a default one
table_ocr (TableOCR): The table OCR object; default value is `None`, which means not to recognize tables
enable_formula (bool): Whether to enable formula recognition; default value is `True`
**kwargs (dict): Other arguments, currently not used
"""
if layout_parser is None:
Expand All @@ -58,11 +60,12 @@ def __init__(
if text_formula_ocr is None:
device = select_device(None)
text_formula_ocr = TextFormulaOCR.from_config(
None, enable_formula=True, device=device
None, enable_formula=enable_formula, device=device
)
self.layout_parser = layout_parser
self.text_formula_ocr = text_formula_ocr
self.table_ocr = table_ocr
self.enable_formula = enable_formula

@classmethod
def from_config(
Expand Down Expand Up @@ -115,6 +118,7 @@ def from_config(
layout_parser=layout_parser,
text_formula_ocr=text_formula_ocr,
table_ocr=table_ocr,
enable_formula=enable_formula,
**kwargs,
)

Expand Down Expand Up @@ -273,6 +277,8 @@ def recognize_page(
crop_patch = img0.crop(box)
crop_width, _ = crop_patch.size
score = 1.0
if not self.enable_formula and image_type == ElementType.FORMULA:
image_type = ElementType.TEXT
if image_type in (ElementType.TEXT, ElementType.TITLE):
_resized_shape = resized_shape
while crop_width > 1.5 * _resized_shape and _resized_shape < 2048:
Expand Down
24 changes: 18 additions & 6 deletions pix2text/text_formula_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import re
from itertools import chain
from pathlib import Path
from typing import Dict, Any, Optional, Union, List, Sequence
from typing import Dict, Any, Optional, Union, List
from copy import copy, deepcopy

from PIL import Image
Expand Down Expand Up @@ -69,8 +69,19 @@ def __init__(
mfd: Optional[Any] = None,
latex_ocr: Optional[LatexOCR] = None,
spellchecker: Optional[SpellChecker] = None,
enable_formula: bool = True,
**kwargs,
):
"""
Recognize text and formula from an image.
Args:
text_ocr (Optional[TextOcrEngine]): Text OCR engine; defaults to `None`.
mfd (Optional[Any]): Math Formula Detector; defaults to `None`.
latex_ocr (Optional[LatexOCR]): Latex OCR engine; defaults to `None`.
spellchecker (Optional[SpellChecker]): Spell Checker; defaults to `None`.
enable_formula (bool): Whether to enable the capability of Math Formula Detection (MFD) and Recognition (MFR); defaults to `True`.
**kwargs ():
"""
if text_ocr is None:
text_config = deepcopy(DEFAULT_CONFIGS['text'])
device = select_device(device=None)
Expand All @@ -92,6 +103,7 @@ def __init__(
self.mfd = mfd
self.latex_ocr = latex_ocr
self.spellchecker = spellchecker
self.enable_formula = enable_formula

@classmethod
def from_config(
Expand Down Expand Up @@ -146,6 +158,7 @@ def from_config(
mfd=mfd,
latex_ocr=latex_ocr,
spellchecker=spellchecker,
enable_formula=enable_formula,
**kwargs,
)

Expand Down Expand Up @@ -227,11 +240,8 @@ def recognize(
analyzer_outs = []
crop_patches = []
mf_results = []
if (
kwargs.get('contain_formula', True)
and self.mfd is not None
and self.latex_ocr is not None
):
enable_formula = kwargs.get('contain_formula', True) and self.enable_formula
if enable_formula and self.mfd is not None and self.latex_ocr is not None:
analyzer_outs = self.mfd(img0.copy(), resized_shape=resized_shape)
for mf_box_info in analyzer_outs:
box = mf_box_info['box']
Expand Down Expand Up @@ -543,6 +553,8 @@ def recognize_formula(
* `score`: The confidence score [0, 1]; the higher, the more confident
"""
if not self.enable_formula:
raise RuntimeError('Formula recognition is not enabled')
if self.latex_ocr is None:
raise RuntimeError('`latex_ocr` model MUST NOT be None')
outs = self.latex_ocr.recognize(
Expand Down
4 changes: 2 additions & 2 deletions tests/test_pix2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_recognize_pdf():
'layout': {'scores_thresh': 0.45},
'text_formula': text_formula_config,
}
p2t = Pix2Text.from_config(total_configs=total_config)
p2t = Pix2Text.from_config(total_configs=total_config, enable_formula=True)
out_md = p2t.recognize_pdf(
img_fp,
page_numbers=[0, 7, 8],
Expand Down Expand Up @@ -173,6 +173,6 @@ def test_example_formula():

def test_example_text():
img_fp = './docs/examples/general.jpg'
p2t = Pix2Text()
p2t = Pix2Text(enable_formula=False)
outs = p2t.recognize_text(img_fp)
print(outs)

0 comments on commit 4438d9b

Please sign in to comment.