Merge pull request #134 from breezedeus/dev

bugfixes
breezedeus · Jul 18, 2024 · 4438d9b · 4438d9b
2 parents c4271c7 + f8efdc6
commit 4438d9b
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 19 deletions.
diff --git a/docs/RELEASE.md b/docs/RELEASE.md
@@ -1,5 +1,19 @@
 # Release Notes
 
+# Update 2024.07.18: **V1.1.1.2** Released
+
+Major Changes:
+
+* Bug fixes:
+    * https://github.com/breezedeus/Pix2Text/issues/129
+    * https://github.com/breezedeus/Pix2Text/issues/116
+
+主要变更：
+
+* 修复 bugs：
+  * https://github.com/breezedeus/Pix2Text/issues/129
+  * https://github.com/breezedeus/Pix2Text/issues/116
+
 # Update 2024.06.24: **V1.1.1.1** Released
 
 Major Changes:

diff --git a/pix2text/__version__.py b/pix2text/__version__.py
@@ -2,4 +2,4 @@
 # [Pix2Text](https://github.com/breezedeus/pix2text): an Open-Source Alternative to Mathpix.
 # Copyright (C) 2022-2024, [Breezedeus](https://www.breezedeus.com).
 
-__version__ = '1.1.1.1'
+__version__ = '1.1.1.2'
diff --git a/pix2text/doc_xl_layout/doc_xl_layout_parser.py b/pix2text/doc_xl_layout/doc_xl_layout_parser.py
@@ -4,7 +4,7 @@
 import os
 import shutil
 from collections import defaultdict
-from copy import deepcopy
+from copy import deepcopy, copy
 from pathlib import Path
 import logging
 from typing import Union, List, Dict, Any, Optional
@@ -110,16 +110,13 @@ def __init__(
 
     @classmethod
     def from_config(cls, configs: Optional[dict] = None, device: str = None, **kwargs):
-        configs = configs or {}
+        configs = copy(configs or {})
         device = select_device(device)
-        # configs['device'] = device if device != 'mps' else 'cpu'
+        model_fp = configs.pop('model_fp', None)
+        root = configs.pop('root', data_dir())
+        configs.pop('device', None)
 
-        return cls(
-            device=device,
-            model_fp=configs.get('model_fp', None),
-            root=configs.get('root', data_dir()),
-            **configs,
-        )
+        return cls(device=device, model_fp=model_fp, root=root, **configs)
 
     def _prepare_model_files(self, root, model_info):
         model_root_dir = Path(root).expanduser() / MODEL_VERSION

diff --git a/pix2text/pix_to_text.py b/pix2text/pix_to_text.py
@@ -41,6 +41,7 @@ def __init__(
         layout_parser: Optional[LayoutParser] = None,
         text_formula_ocr: Optional[TextFormulaOCR] = None,
         table_ocr: Optional[TableOCR] = None,
+        enable_formula: bool = True,
         **kwargs,
     ):
         """
@@ -49,6 +50,7 @@ def __init__(
             layout_parser (LayoutParser): The layout parser object; default value is `None`, which means to create a default one
             text_formula_ocr (TextFormulaOCR): The text and formula OCR object; default value is `None`, which means to create a default one
             table_ocr (TableOCR): The table OCR object; default value is `None`, which means not to recognize tables
+            enable_formula (bool): Whether to enable formula recognition; default value is `True`
             **kwargs (dict): Other arguments, currently not used
         """
         if layout_parser is None:
@@ -58,11 +60,12 @@ def __init__(
         if text_formula_ocr is None:
             device = select_device(None)
             text_formula_ocr = TextFormulaOCR.from_config(
-                None, enable_formula=True, device=device
+                None, enable_formula=enable_formula, device=device
             )
         self.layout_parser = layout_parser
         self.text_formula_ocr = text_formula_ocr
         self.table_ocr = table_ocr
+        self.enable_formula = enable_formula
 
     @classmethod
     def from_config(
@@ -115,6 +118,7 @@ def from_config(
             layout_parser=layout_parser,
             text_formula_ocr=text_formula_ocr,
             table_ocr=table_ocr,
+            enable_formula=enable_formula,
             **kwargs,
         )
 
@@ -273,6 +277,8 @@ def recognize_page(
             crop_patch = img0.crop(box)
             crop_width, _ = crop_patch.size
             score = 1.0
+            if not self.enable_formula and image_type == ElementType.FORMULA:
+                image_type = ElementType.TEXT
             if image_type in (ElementType.TEXT, ElementType.TITLE):
                 _resized_shape = resized_shape
                 while crop_width > 1.5 * _resized_shape and _resized_shape < 2048:

diff --git a/pix2text/text_formula_ocr.py b/pix2text/text_formula_ocr.py
@@ -6,7 +6,7 @@
 import re
 from itertools import chain
 from pathlib import Path
-from typing import Dict, Any, Optional, Union, List, Sequence
+from typing import Dict, Any, Optional, Union, List
 from copy import copy, deepcopy
 
 from PIL import Image
@@ -69,8 +69,19 @@ def __init__(
         mfd: Optional[Any] = None,
         latex_ocr: Optional[LatexOCR] = None,
         spellchecker: Optional[SpellChecker] = None,
+        enable_formula: bool = True,
         **kwargs,
     ):
+        """
+        Recognize text and formula from an image.
+        Args:
+            text_ocr (Optional[TextOcrEngine]): Text OCR engine; defaults to `None`.
+            mfd (Optional[Any]): Math Formula Detector; defaults to `None`.
+            latex_ocr (Optional[LatexOCR]): Latex OCR engine; defaults to `None`.
+            spellchecker (Optional[SpellChecker]): Spell Checker; defaults to `None`.
+            enable_formula (bool): Whether to enable the capability of Math Formula Detection (MFD) and Recognition (MFR); defaults to `True`.
+            **kwargs ():
+        """
         if text_ocr is None:
             text_config = deepcopy(DEFAULT_CONFIGS['text'])
             device = select_device(device=None)
@@ -92,6 +103,7 @@ def __init__(
         self.mfd = mfd
         self.latex_ocr = latex_ocr
         self.spellchecker = spellchecker
+        self.enable_formula = enable_formula
 
     @classmethod
     def from_config(
@@ -146,6 +158,7 @@ def from_config(
             mfd=mfd,
             latex_ocr=latex_ocr,
             spellchecker=spellchecker,
+            enable_formula=enable_formula,
             **kwargs,
         )
 
@@ -227,11 +240,8 @@ def recognize(
         analyzer_outs = []
         crop_patches = []
         mf_results = []
-        if (
-            kwargs.get('contain_formula', True)
-            and self.mfd is not None
-            and self.latex_ocr is not None
-        ):
+        enable_formula = kwargs.get('contain_formula', True) and self.enable_formula
+        if enable_formula and self.mfd is not None and self.latex_ocr is not None:
             analyzer_outs = self.mfd(img0.copy(), resized_shape=resized_shape)
             for mf_box_info in analyzer_outs:
                 box = mf_box_info['box']
@@ -543,6 +553,8 @@ def recognize_formula(
                     * `score`: The confidence score [0, 1]; the higher, the more confident
 
         """
+        if not self.enable_formula:
+            raise RuntimeError('Formula recognition is not enabled')
         if self.latex_ocr is None:
             raise RuntimeError('`latex_ocr` model MUST NOT be None')
         outs = self.latex_ocr.recognize(

diff --git a/tests/test_pix2text.py b/tests/test_pix2text.py
@@ -37,7 +37,7 @@ def test_recognize_pdf():
         'layout': {'scores_thresh': 0.45},
         'text_formula': text_formula_config,
     }
-    p2t = Pix2Text.from_config(total_configs=total_config)
+    p2t = Pix2Text.from_config(total_configs=total_config, enable_formula=True)
     out_md = p2t.recognize_pdf(
         img_fp,
         page_numbers=[0, 7, 8],
@@ -173,6 +173,6 @@ def test_example_formula():
 
 def test_example_text():
     img_fp = './docs/examples/general.jpg'
-    p2t = Pix2Text()
+    p2t = Pix2Text(enable_formula=False)
     outs = p2t.recognize_text(img_fp)
     print(outs)