Skip to content

Commit

Permalink
Merge pull request useblocks#15 from useblocks/modif-annotation
Browse files Browse the repository at this point in the history
Flag to disable annotation extraction
  • Loading branch information
ubmarco authored Apr 26, 2022
2 parents d7424d5 + 6f87bee commit d9df43a
Show file tree
Hide file tree
Showing 9 changed files with 224 additions and 202 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,7 @@ pip-selfcheck.json
.vscode/launch.json
!.vscode/launch.json.default
!.vscode/settings.json.default

# Ignore the output from libpdf
visual_debug*
out*
6 changes: 6 additions & 0 deletions docs/contents/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ Fixed

- Fixed catalog outline title resolve issue (`PR #10 <https://github.com/useblocks/libpdf/pull/10>`_)

Added
~~~~~

- Added flag ``--no-annotations`` to exclude annotation extraction from the catalog to speed up extraction
(`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_)

__ https://github.com/useblocks/libpdf/releases/tag/v0.0.1

`0.0.1`__ - 2020-06-30
Expand Down
17 changes: 13 additions & 4 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

from libpdf.log import logging_needed
from libpdf.parameters import ANNO_X_TOLERANCE, ANNO_Y_TOLERANCE
from libpdf.progress import bar_format_lvl2, tqdm
from libpdf.utils import decode_title, to_pdfplumber_bbox

from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral


LOG = logging.getLogger(__name__)


Expand Down Expand Up @@ -500,7 +502,9 @@ def annotation_dict_extraction(pdf):

annotation_page_map = {}

for idx_page, page in enumerate(pdf.pages):
for idx_page, page in enumerate(
tqdm(pdf.pages, desc='###### Extracting annotations', unit='pages', bar_format=bar_format_lvl2()),
):
if logging_needed(idx_page, len(pdf.pages)):
LOG.debug('Catalog extraction: annotations page %s of %s', idx_page + 1, len(pdf.pages))

Expand Down Expand Up @@ -639,7 +643,7 @@ def _resolve_pdf_obj_refs(
return resolved_dict, resolved_list


def extract_catalog(pdf):
def extract_catalog(pdf, no_annotations: bool):
"""
Extract catalog document of a PDF.
Expand All @@ -658,8 +662,13 @@ def extract_catalog(pdf):
# resolved_catalog, _ = _resolve_pdf_obj_refs(pdf.doc.catalog, resolved_objects)
# del resolved_catalog # denote it is not yet used

# extract annotation (link source) and store in the dict by pages for further process of links on texts in extract()
ann_dict = annotation_dict_extraction(pdf)
if no_annotations:
ann_dict = None
LOG.info('Catalog extraction: annotations is excluded')
else:
# extract annotation (link source) and store in the dict by pages for further process of links
# on texts in extract()
ann_dict = annotation_dict_extraction(pdf)

# extract named destinations (link targets) and store them in the dict for further processing in extract()
des_dict = get_named_destination(pdf)
Expand Down
16 changes: 15 additions & 1 deletion libpdf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
output_path: str = None,
save_figures: bool = False,
figure_dir: str = None,
no_annotations: bool = False,
no_chapters: bool = False,
no_paragraphs: bool = False,
no_tables: bool = False,
Expand Down Expand Up @@ -60,6 +61,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
:param figure_dir: output directory for extracted figures; if it does not exist, it will be created
:param output_format: only relevant for CLI, allowed values are json, yaml or stdout
:param output_path: only relevant for CLI, path to the output file for output_formats json or yaml
:param no_annotations: flag triggering the exclusion of annotations from pdf catalog
:param no_chapters: flag triggering the exclusion of chapters (flat structure of elements)
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
Expand Down Expand Up @@ -106,6 +108,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages))
LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop))
LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off')
LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes')
LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes')
LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes')
LOG.info('Extract tables: %s', 'no' if no_tables else 'yes')
Expand All @@ -118,6 +121,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
smart_page_crop,
save_figures,
figure_dir,
no_annotations,
no_chapters,
no_paragraphs,
no_tables,
Expand Down Expand Up @@ -153,14 +157,15 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
return None


def main_api( # pylint: disable=too-many-arguments
def main_api( # pylint: disable=too-many-arguments, too-many-locals
pdf: str,
verbose: int = 1, # log level WARNING for library usage is considered a good compromise as a default
page_range: str = None,
page_crop: Tuple[float, float, float, float] = None,
smart_page_crop: bool = False,
save_figures: bool = False,
figure_dir: str = 'figures',
no_annotations: bool = False,
no_chapters: bool = False,
no_paragraphs: bool = False,
no_tables: bool = False,
Expand All @@ -185,6 +190,7 @@ def main_api( # pylint: disable=too-many-arguments
:param smart_page_crop: see description in function core.main()
:param save_figures: flag triggering the export of figures to the figure_dir
:param figure_dir: output directory for extracted figures; if it does not exist, it will be created
:param no_annotations: flag triggering the exclusion of annotations from pdf catalog
:param no_chapters: flag triggering the exclusion of chapters (resulting in a flat list of elements)
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
Expand Down Expand Up @@ -227,6 +233,7 @@ def main_api( # pylint: disable=too-many-arguments
smart_page_crop=smart_page_crop,
save_figures=save_figures,
figure_dir=figure_dir,
no_annotations=no_annotations,
no_chapters=no_chapters,
no_paragraphs=no_paragraphs,
no_tables=no_tables,
Expand Down Expand Up @@ -409,6 +416,13 @@ def handle_parse_result(self, ctx, opts, args):
show_default=True,
help='Output directory for extracted figures; if it does not exist, it will be created',
)
@click.option(
'--no-annotations',
is_flag=True,
show_default=True,
help='Do not extract annotations from catalog. All PDF-internal links will not be resolved.'
' Chapter detection however will work',
)
@click.option(
'--no-chapters',
is_flag=True,
Expand Down
6 changes: 4 additions & 2 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ def folded_str_representer(dumper, text):
yaml.add_representer(FoldedStr, folded_str_representer)


def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-statements
def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-statements, too-many-arguments
pdf_path: str,
pages: Optional[List[int]],
smart_page_crop: bool,
save_figures: bool,
figure_dir: Optional[str],
no_annotations: bool,
no_chapters: bool,
no_paragraphs: bool,
no_tables: bool,
Expand All @@ -71,6 +72,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
:param smart_page_crop: see description in function core.main()
:param save_figures: flag triggering the export of figures to the figure_dir
:param figure_dir: output directory for extracted figures
:param no_annotations: flag triggering the exclusion of annotations from pdf catalog
:param no_chapters: flag triggering the exclusion of chapters (flat structure of elements)
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
Expand Down Expand Up @@ -138,7 +140,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
overall_pbar.update(1)

# extract annotations, name destinations and outline
extract_catalog(pdf)
extract_catalog(pdf, no_annotations)
overall_pbar.update(10)

# In figure_dict, figures are sorted by pages and y coordinates.
Expand Down
2 changes: 1 addition & 1 deletion libpdf/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def find_target_id(link: Link, pages_list: List[Page], src_element: Element) ->

text = str(src_element)
text_shortened = (text[:60] + '..') if len(text) > 60 else text
LOG.warning(
LOG.debug(
'The link "%s" on page %s could not be resolved to a libpdf element; replacing it with the raw '
'target page coordinate %s',
text_shortened,
Expand Down
16 changes: 13 additions & 3 deletions libpdf/textbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def extract_paragraphs_chapters(
LOG.info('Excluding chapters extraction')
else:
if catalog['outline']:
LOG.info('Extracting chapters ...')
chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)

paragraph_list = []
Expand Down Expand Up @@ -165,7 +166,12 @@ def render_chapters( # pylint: disable=too-many-branches, too-many-locals
chapters_sorted_by_page[chapter['position']['page']] = []
chapters_sorted_by_page[chapter['position']['page']].append(chapter)

for page_number, chapters in chapters_sorted_by_page.items():
for page_number, chapters in tqdm(
chapters_sorted_by_page.items(),
desc='###### Extracting chapters',
unit='pages',
bar_format=bar_format_lvl2(),
):
if page_number - 1 in page_lt_textboxes_filtered:
lt_textboxes = page_lt_textboxes_filtered[page_number - 1]
for page in page_list:
Expand Down Expand Up @@ -477,7 +483,12 @@ def render_paragraphs( # pylint: disable=too-many-branches
paragraph_list = []
paragraph_id = 1

for page_index, lt_textboxes in page_lt_textboxes_filtered.items():
for page_index, lt_textboxes in tqdm(
page_lt_textboxes_filtered.items(),
desc='###### Extracting paragraphs',
unit='pages',
bar_format=bar_format_lvl2(),
):
# add lt_textbox to a list of paragraphs
for lt_textbox in lt_textboxes:
# get position of lt_textbox
Expand Down Expand Up @@ -848,7 +859,6 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
desc='###### Extracting layout',
unit='pages',
bar_format=bar_format_lvl2(),
leave=False,
),
):
if logging_needed(idx_page, len(pdf.pages)):
Expand Down
Loading

0 comments on commit d9df43a

Please sign in to comment.