From b825267887d268ea5e10206e3b6dfa46b4d57daa Mon Sep 17 00:00:00 2001 From: breezedeus Date: Thu, 18 Jul 2024 13:29:36 +0800 Subject: [PATCH 1/2] bugfixes --- .../doc_xl_layout/doc_xl_layout_parser.py | 15 +++++------- pix2text/pix_to_text.py | 8 ++++++- pix2text/text_formula_ocr.py | 24 ++++++++++++++----- tests/test_pix2text.py | 4 ++-- 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/pix2text/doc_xl_layout/doc_xl_layout_parser.py b/pix2text/doc_xl_layout/doc_xl_layout_parser.py index 392190e..7439ff5 100644 --- a/pix2text/doc_xl_layout/doc_xl_layout_parser.py +++ b/pix2text/doc_xl_layout/doc_xl_layout_parser.py @@ -4,7 +4,7 @@ import os import shutil from collections import defaultdict -from copy import deepcopy +from copy import deepcopy, copy from pathlib import Path import logging from typing import Union, List, Dict, Any, Optional @@ -110,16 +110,13 @@ def __init__( @classmethod def from_config(cls, configs: Optional[dict] = None, device: str = None, **kwargs): - configs = configs or {} + configs = copy(configs or {}) device = select_device(device) - # configs['device'] = device if device != 'mps' else 'cpu' + model_fp = configs.pop('model_fp', None) + root = configs.pop('root', data_dir()) + configs.pop('device', None) - return cls( - device=device, - model_fp=configs.get('model_fp', None), - root=configs.get('root', data_dir()), - **configs, - ) + return cls(device=device, model_fp=model_fp, root=root, **configs) def _prepare_model_files(self, root, model_info): model_root_dir = Path(root).expanduser() / MODEL_VERSION diff --git a/pix2text/pix_to_text.py b/pix2text/pix_to_text.py index c02310e..37584a1 100644 --- a/pix2text/pix_to_text.py +++ b/pix2text/pix_to_text.py @@ -41,6 +41,7 @@ def __init__( layout_parser: Optional[LayoutParser] = None, text_formula_ocr: Optional[TextFormulaOCR] = None, table_ocr: Optional[TableOCR] = None, + enable_formula: bool = True, **kwargs, ): """ @@ -49,6 +50,7 @@ def __init__( layout_parser (LayoutParser): 
The layout parser object; default value is `None`, which means to create a default one text_formula_ocr (TextFormulaOCR): The text and formula OCR object; default value is `None`, which means to create a default one table_ocr (TableOCR): The table OCR object; default value is `None`, which means not to recognize tables + enable_formula (bool): Whether to enable formula recognition; default value is `True` **kwargs (dict): Other arguments, currently not used """ if layout_parser is None: @@ -58,11 +60,12 @@ def __init__( if text_formula_ocr is None: device = select_device(None) text_formula_ocr = TextFormulaOCR.from_config( - None, enable_formula=True, device=device + None, enable_formula=enable_formula, device=device ) self.layout_parser = layout_parser self.text_formula_ocr = text_formula_ocr self.table_ocr = table_ocr + self.enable_formula = enable_formula @classmethod def from_config( @@ -115,6 +118,7 @@ def from_config( layout_parser=layout_parser, text_formula_ocr=text_formula_ocr, table_ocr=table_ocr, + enable_formula=enable_formula, **kwargs, ) @@ -273,6 +277,8 @@ def recognize_page( crop_patch = img0.crop(box) crop_width, _ = crop_patch.size score = 1.0 + if not self.enable_formula and image_type == ElementType.FORMULA: + image_type = ElementType.TEXT if image_type in (ElementType.TEXT, ElementType.TITLE): _resized_shape = resized_shape while crop_width > 1.5 * _resized_shape and _resized_shape < 2048: diff --git a/pix2text/text_formula_ocr.py b/pix2text/text_formula_ocr.py index 66d0b89..2180ad1 100644 --- a/pix2text/text_formula_ocr.py +++ b/pix2text/text_formula_ocr.py @@ -6,7 +6,7 @@ import re from itertools import chain from pathlib import Path -from typing import Dict, Any, Optional, Union, List, Sequence +from typing import Dict, Any, Optional, Union, List from copy import copy, deepcopy from PIL import Image @@ -69,8 +69,19 @@ def __init__( mfd: Optional[Any] = None, latex_ocr: Optional[LatexOCR] = None, spellchecker: Optional[SpellChecker] = None, 
+ enable_formula: bool = True, **kwargs, ): + """ + Recognize text and formula from an image. + Args: + text_ocr (Optional[TextOcrEngine]): Text OCR engine; defaults to `None`. + mfd (Optional[Any]): Math Formula Detector; defaults to `None`. + latex_ocr (Optional[LatexOCR]): LaTeX OCR engine; defaults to `None`. + spellchecker (Optional[SpellChecker]): Spell Checker; defaults to `None`. + enable_formula (bool): Whether to enable the capability of Math Formula Detection (MFD) and Recognition (MFR); defaults to `True`. + **kwargs (dict): Additional keyword arguments. + """ if text_ocr is None: text_config = deepcopy(DEFAULT_CONFIGS['text']) device = select_device(device=None) @@ -92,6 +103,7 @@ def __init__( self.mfd = mfd self.latex_ocr = latex_ocr self.spellchecker = spellchecker + self.enable_formula = enable_formula @classmethod def from_config( @@ -146,6 +158,7 @@ def from_config( mfd=mfd, latex_ocr=latex_ocr, spellchecker=spellchecker, + enable_formula=enable_formula, **kwargs, ) @@ -227,11 +240,8 @@ def recognize( analyzer_outs = [] crop_patches = [] mf_results = [] - if ( - kwargs.get('contain_formula', True) - and self.mfd is not None - and self.latex_ocr is not None - ): + enable_formula = kwargs.get('contain_formula', True) and self.enable_formula + if enable_formula and self.mfd is not None and self.latex_ocr is not None: analyzer_outs = self.mfd(img0.copy(), resized_shape=resized_shape) for mf_box_info in analyzer_outs: box = mf_box_info['box'] @@ -543,6 +553,8 @@ def recognize_formula( * `score`: The confidence score [0, 1]; the higher, the more confident """ + if not self.enable_formula: + raise RuntimeError('Formula recognition is not enabled') if self.latex_ocr is None: raise RuntimeError('`latex_ocr` model MUST NOT be None') outs = self.latex_ocr.recognize( diff --git a/tests/test_pix2text.py b/tests/test_pix2text.py index 7482eff..f8721f4 100644 --- a/tests/test_pix2text.py +++ b/tests/test_pix2text.py @@ -37,7 +37,7 @@ def test_recognize_pdf(): 'layout': {'scores_thresh': 0.45}, 
'text_formula': text_formula_config, } - p2t = Pix2Text.from_config(total_configs=total_config) + p2t = Pix2Text.from_config(total_configs=total_config, enable_formula=True) out_md = p2t.recognize_pdf( img_fp, page_numbers=[0, 7, 8], @@ -173,6 +173,6 @@ def test_example_formula(): def test_example_text(): img_fp = './docs/examples/general.jpg' - p2t = Pix2Text() + p2t = Pix2Text(enable_formula=False) outs = p2t.recognize_text(img_fp) print(outs) From f8efdc65e6e9eb3463087fafeb675a37c0ed7a2d Mon Sep 17 00:00:00 2001 From: breezedeus Date: Thu, 18 Jul 2024 13:30:03 +0800 Subject: [PATCH 2/2] bump version --- docs/RELEASE.md | 14 ++++++++++++++ pix2text/__version__.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/RELEASE.md b/docs/RELEASE.md index d3a8953..c6f8d58 100644 --- a/docs/RELEASE.md +++ b/docs/RELEASE.md @@ -1,5 +1,19 @@ # Release Notes +# Update 2024.07.18: **V1.1.1.2** Released + +Major Changes: + +* fix bugs: + * https://github.com/breezedeus/Pix2Text/issues/129 + * https://github.com/breezedeus/Pix2Text/issues/116 + +主要变更: + +* 修复 bugs: + * https://github.com/breezedeus/Pix2Text/issues/129 + * https://github.com/breezedeus/Pix2Text/issues/116 + # Update 2024.06.24: **V1.1.1.1** Released Major Changes: diff --git a/pix2text/__version__.py b/pix2text/__version__.py index 97f1c6d..4ab5f76 100644 --- a/pix2text/__version__.py +++ b/pix2text/__version__.py @@ -2,4 +2,4 @@ # [Pix2Text](https://github.com/breezedeus/pix2text): an Open-Source Alternative to Mathpix. # Copyright (C) 2022-2024, [Breezedeus](https://www.breezedeus.com). -__version__ = '1.1.1.1' +__version__ = '1.1.1.2'