Merge pull request useblocks#15 from useblocks/modif-annotation

Flag to disable annotation extraction
swarajdash · Apr 26, 2022 · d9df43a · d9df43a
2 parents d7424d5 + 6f87bee
commit d9df43a
Show file tree

Hide file tree

Showing 9 changed files with 224 additions and 202 deletions.
diff --git a/.gitignore b/.gitignore
@@ -270,3 +270,7 @@ pip-selfcheck.json
 .vscode/launch.json
 !.vscode/launch.json.default
 !.vscode/settings.json.default
+
+# Ignore the output from libpdf
+visual_debug*
+out*
diff --git a/docs/contents/changelog.rst b/docs/contents/changelog.rst
@@ -22,6 +22,12 @@ Fixed
 
 - Fixed catalog outline title resolve issue (`PR #10 <https://github.com/useblocks/libpdf/pull/10>`_)
 
+Added
+~~~~~
+
+- Added flag ``--no-annotations`` to exclude annotation extraction from the catalog to speed up extraction
+  (`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_)
+
 __ https://github.com/useblocks/libpdf/releases/tag/v0.0.1
 
 `0.0.1`__ - 2020-06-30

diff --git a/libpdf/catalog.py b/libpdf/catalog.py
@@ -5,11 +5,13 @@
 
 from libpdf.log import logging_needed
 from libpdf.parameters import ANNO_X_TOLERANCE, ANNO_Y_TOLERANCE
+from libpdf.progress import bar_format_lvl2, tqdm
 from libpdf.utils import decode_title, to_pdfplumber_bbox
 
 from pdfminer.pdftypes import PDFObjRef
 from pdfminer.psparser import PSLiteral
 
+
 LOG = logging.getLogger(__name__)
 
 
@@ -500,7 +502,9 @@ def annotation_dict_extraction(pdf):
 
     annotation_page_map = {}
 
-    for idx_page, page in enumerate(pdf.pages):
+    for idx_page, page in enumerate(
+        tqdm(pdf.pages, desc='###### Extracting annotations', unit='pages', bar_format=bar_format_lvl2()),
+    ):
         if logging_needed(idx_page, len(pdf.pages)):
             LOG.debug('Catalog extraction: annotations page %s of %s', idx_page + 1, len(pdf.pages))
 
@@ -639,7 +643,7 @@ def _resolve_pdf_obj_refs(
     return resolved_dict, resolved_list
 
 
-def extract_catalog(pdf):
+def extract_catalog(pdf, no_annotations: bool):
     """
     Extract catalog document of a PDF.
 
@@ -658,8 +662,13 @@ def extract_catalog(pdf):
     # resolved_catalog, _ = _resolve_pdf_obj_refs(pdf.doc.catalog, resolved_objects)
     # del resolved_catalog  # denote it is not yet used
 
-    # extract annotation (link source) and store in the dict by pages for further process of links on texts in extract()
-    ann_dict = annotation_dict_extraction(pdf)
+    if no_annotations:
+        ann_dict = None
+        LOG.info('Catalog extraction: annotations is excluded')
+    else:
+        # extract annotation (link source) and store in the dict by pages for further process of links
+        # on texts in extract()
+        ann_dict = annotation_dict_extraction(pdf)
 
     # extract name destination (link target)and store in the dict for further process in extract()
     des_dict = get_named_destination(pdf)

diff --git a/libpdf/core.py b/libpdf/core.py
@@ -30,6 +30,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     output_path: str = None,
     save_figures: bool = False,
     figure_dir: str = None,
+    no_annotations: bool = False,
     no_chapters: bool = False,
     no_paragraphs: bool = False,
     no_tables: bool = False,
@@ -60,6 +61,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     :param figure_dir: output directory for extracted figures; if it does not exist, it will be created
     :param output_format: only relevant for CLI, allowed values are json, yaml or stdout
     :param output_path: only relevant for CLI, path to the output file for output_formats json or yaml
+    :param no_annotations: flag triggering the exclusion of annotations from pdf catalog
     :param no_chapters: flag triggering the exclusion of chapters (flat structure of elements)
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
@@ -106,6 +108,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
         LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages))
         LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop))
         LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off')
+        LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes')
         LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes')
         LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes')
         LOG.info('Extract tables: %s', 'no' if no_tables else 'yes')
@@ -118,6 +121,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
                 smart_page_crop,
                 save_figures,
                 figure_dir,
+                no_annotations,
                 no_chapters,
                 no_paragraphs,
                 no_tables,
@@ -153,14 +157,15 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     return None
 
 
-def main_api(  # pylint: disable=too-many-arguments
+def main_api(  # pylint: disable=too-many-arguments, too-many-locals
     pdf: str,
     verbose: int = 1,  # log level WARNING for library usage is considered a good compromise as a default
     page_range: str = None,
     page_crop: Tuple[float, float, float, float] = None,
     smart_page_crop: bool = False,
     save_figures: bool = False,
     figure_dir: str = 'figures',
+    no_annotations: bool = False,
     no_chapters: bool = False,
     no_paragraphs: bool = False,
     no_tables: bool = False,
@@ -185,6 +190,7 @@ def main_api(  # pylint: disable=too-many-arguments
     :param smart_page_crop: see description in function core.main()
     :param save_figures: flag triggering the export of figures to the figure_dir
     :param figure_dir: output directory for extracted figures; if it does not exist, it will be created
+    :param no_annotations: flag triggering the exclusion of annotations from pdf catalog
     :param no_chapters: flag triggering the exclusion of chapters (resulting in a flat list of elements)
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
@@ -227,6 +233,7 @@ def main_api(  # pylint: disable=too-many-arguments
         smart_page_crop=smart_page_crop,
         save_figures=save_figures,
         figure_dir=figure_dir,
+        no_annotations=no_annotations,
         no_chapters=no_chapters,
         no_paragraphs=no_paragraphs,
         no_tables=no_tables,
@@ -409,6 +416,13 @@ def handle_parse_result(self, ctx, opts, args):
     show_default=True,
     help='Output directory for extracted figures; if it does not exist, it will be created',
 )
+@click.option(
+    '--no-annotations',
+    is_flag=True,
+    show_default=True,
+    help='Do not extract annotations from catalog. All PDF-internal links will not be resolved.'
+    ' Chapter detection however will work',
+)
 @click.option(
     '--no-chapters',
     is_flag=True,

diff --git a/libpdf/extract.py b/libpdf/extract.py
@@ -51,12 +51,13 @@ def folded_str_representer(dumper, text):
 yaml.add_representer(FoldedStr, folded_str_representer)
 
 
-def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-statements
+def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-statements, too-many-arguments
     pdf_path: str,
     pages: Optional[List[int]],
     smart_page_crop: bool,
     save_figures: bool,
     figure_dir: Optional[str],
+    no_annotations: bool,
     no_chapters: bool,
     no_paragraphs: bool,
     no_tables: bool,
@@ -71,6 +72,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
     :param smart_page_crop: see description in function core.main()
     :param save_figures: flag triggering the export of figures to the figure_dir
     :param figure_dir: output directory for extracted figures
+    :param no_annotations: flag triggering the exclusion of annotations from pdf catalog
     :param no_chapters: flag triggering the exclusion of chapters (flat structure of elements)
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
@@ -138,7 +140,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
         overall_pbar.update(1)
 
         # extract annotations, name destinations and outline
-        extract_catalog(pdf)
+        extract_catalog(pdf, no_annotations)
         overall_pbar.update(10)
 
         # In figure_dict, figures are sorted by pages and y coordinates.

diff --git a/libpdf/process.py b/libpdf/process.py
@@ -474,7 +474,7 @@ def find_target_id(link: Link, pages_list: List[Page], src_element: Element) ->
 
             text = str(src_element)
             text_shortened = (text[:60] + '..') if len(text) > 60 else text
-            LOG.warning(
+            LOG.debug(
                 'The link "%s" on page %s could not be resolved to a libpdf element; replacing it with the raw '
                 'target page coordinate %s',
                 text_shortened,

diff --git a/libpdf/textbox.py b/libpdf/textbox.py
@@ -79,6 +79,7 @@ def extract_paragraphs_chapters(
         LOG.info('Excluding chapters extraction')
     else:
         if catalog['outline']:
+            LOG.info('Extracting chapters ...')
             chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)
 
     paragraph_list = []
@@ -165,7 +166,12 @@ def render_chapters(  # pylint: disable=too-many-branches, too-many-locals
                 chapters_sorted_by_page[chapter['position']['page']] = []
             chapters_sorted_by_page[chapter['position']['page']].append(chapter)
 
-    for page_number, chapters in chapters_sorted_by_page.items():
+    for page_number, chapters in tqdm(
+        chapters_sorted_by_page.items(),
+        desc='###### Extracting chapters',
+        unit='pages',
+        bar_format=bar_format_lvl2(),
+    ):
         if page_number - 1 in page_lt_textboxes_filtered:
             lt_textboxes = page_lt_textboxes_filtered[page_number - 1]
             for page in page_list:
@@ -477,7 +483,12 @@ def render_paragraphs(  # pylint: disable=too-many-branches
     paragraph_list = []
     paragraph_id = 1
 
-    for page_index, lt_textboxes in page_lt_textboxes_filtered.items():
+    for page_index, lt_textboxes in tqdm(
+        page_lt_textboxes_filtered.items(),
+        desc='###### Extracting paragraphs',
+        unit='pages',
+        bar_format=bar_format_lvl2(),
+    ):
         # add lt_textbox to a list of paragraphs
         for lt_textbox in lt_textboxes:
             # get position of lt_textbox
@@ -848,7 +859,6 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
             desc='###### Extracting layout',
             unit='pages',
             bar_format=bar_format_lvl2(),
-            leave=False,
         ),
     ):
         if logging_needed(idx_page, len(pdf.pages)):