Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flag to disable annotation extraction #15

Merged
merged 9 commits into from
Apr 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,7 @@ pip-selfcheck.json
.vscode/launch.json
!.vscode/launch.json.default
!.vscode/settings.json.default

# Ignore the output from libpdf
visual_debug*
out*
6 changes: 6 additions & 0 deletions docs/contents/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ Fixed

- Fixed catalog outline title resolve issue (`PR #10 <https://github.com/useblocks/libpdf/pull/10>`_)

Added
~~~~~

- Added flag ``--no-annotations`` to exclude annotation extraction from the catalog to speed up extraction
(`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_)

__ https://github.com/useblocks/libpdf/releases/tag/v0.0.1

`0.0.1`__ - 2020-06-30
Expand Down
17 changes: 13 additions & 4 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

from libpdf.log import logging_needed
from libpdf.parameters import ANNO_X_TOLERANCE, ANNO_Y_TOLERANCE
from libpdf.progress import bar_format_lvl2, tqdm
from libpdf.utils import decode_title, to_pdfplumber_bbox

from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral


LOG = logging.getLogger(__name__)


Expand Down Expand Up @@ -500,7 +502,9 @@ def annotation_dict_extraction(pdf):

annotation_page_map = {}

for idx_page, page in enumerate(pdf.pages):
for idx_page, page in enumerate(
tqdm(pdf.pages, desc='###### Extracting annotations', unit='pages', bar_format=bar_format_lvl2()),
):
if logging_needed(idx_page, len(pdf.pages)):
LOG.debug('Catalog extraction: annotations page %s of %s', idx_page + 1, len(pdf.pages))

Expand Down Expand Up @@ -639,7 +643,7 @@ def _resolve_pdf_obj_refs(
return resolved_dict, resolved_list


def extract_catalog(pdf):
def extract_catalog(pdf, no_annotations: bool):
"""
Extract catalog document of a PDF.

Expand All @@ -658,8 +662,13 @@ def extract_catalog(pdf):
# resolved_catalog, _ = _resolve_pdf_obj_refs(pdf.doc.catalog, resolved_objects)
# del resolved_catalog # denote it is not yet used

# extract annotation (link source) and store in the dict by pages for further process of links on texts in extract()
ann_dict = annotation_dict_extraction(pdf)
if no_annotations:
ann_dict = None
LOG.info('Catalog extraction: annotations is excluded')
else:
# extract annotation (link source) and store in the dict by pages for further process of links
# on texts in extract()
ann_dict = annotation_dict_extraction(pdf)

# extract named destinations (link targets) and store them in the dict for further processing in extract()
des_dict = get_named_destination(pdf)
Expand Down
16 changes: 15 additions & 1 deletion libpdf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
output_path: str = None,
save_figures: bool = False,
figure_dir: str = None,
no_annotations: bool = False,
no_chapters: bool = False,
no_paragraphs: bool = False,
no_tables: bool = False,
Expand Down Expand Up @@ -60,6 +61,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
:param figure_dir: output directory for extracted figures; if it does not exist, it will be created
:param output_format: only relevant for CLI, allowed values are json, yaml or stdout
:param output_path: only relevant for CLI, path to the output file for output_formats json or yaml
:param no_annotations: flag triggering the exclusion of annotations from pdf catalog
:param no_chapters: flag triggering the exclusion of chapters (flat structure of elements)
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
Expand Down Expand Up @@ -106,6 +108,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages))
LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop))
LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off')
LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes')
LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes')
LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes')
LOG.info('Extract tables: %s', 'no' if no_tables else 'yes')
Expand All @@ -118,6 +121,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
smart_page_crop,
save_figures,
figure_dir,
no_annotations,
no_chapters,
no_paragraphs,
no_tables,
Expand Down Expand Up @@ -153,14 +157,15 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
return None


def main_api( # pylint: disable=too-many-arguments
def main_api( # pylint: disable=too-many-arguments, too-many-locals
pdf: str,
verbose: int = 1, # log level WARNING for library usage is considered a good compromise as a default
page_range: str = None,
page_crop: Tuple[float, float, float, float] = None,
smart_page_crop: bool = False,
save_figures: bool = False,
figure_dir: str = 'figures',
no_annotations: bool = False,
no_chapters: bool = False,
no_paragraphs: bool = False,
no_tables: bool = False,
Expand All @@ -185,6 +190,7 @@ def main_api( # pylint: disable=too-many-arguments
:param smart_page_crop: see description in function core.main()
:param save_figures: flag triggering the export of figures to the figure_dir
:param figure_dir: output directory for extracted figures; if it does not exist, it will be created
:param no_annotations: flag triggering the exclusion of annotations from pdf catalog
:param no_chapters: flag triggering the exclusion of chapters (resulting in a flat list of elements)
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
Expand Down Expand Up @@ -227,6 +233,7 @@ def main_api( # pylint: disable=too-many-arguments
smart_page_crop=smart_page_crop,
save_figures=save_figures,
figure_dir=figure_dir,
no_annotations=no_annotations,
no_chapters=no_chapters,
no_paragraphs=no_paragraphs,
no_tables=no_tables,
Expand Down Expand Up @@ -409,6 +416,13 @@ def handle_parse_result(self, ctx, opts, args):
show_default=True,
help='Output directory for extracted figures; if it does not exist, it will be created',
)
@click.option(
'--no-annotations',
is_flag=True,
show_default=True,
help='Do not extract annotations from catalog. All PDF-internal links will not be resolved.'
' Chapter detection however will work',
)
@click.option(
'--no-chapters',
is_flag=True,
Expand Down
6 changes: 4 additions & 2 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ def folded_str_representer(dumper, text):
yaml.add_representer(FoldedStr, folded_str_representer)


def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-statements
def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-statements, too-many-arguments
pdf_path: str,
pages: Optional[List[int]],
smart_page_crop: bool,
save_figures: bool,
figure_dir: Optional[str],
no_annotations: bool,
no_chapters: bool,
no_paragraphs: bool,
no_tables: bool,
Expand All @@ -71,6 +72,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
:param smart_page_crop: see description in function core.main()
:param save_figures: flag triggering the export of figures to the figure_dir
:param figure_dir: output directory for extracted figures
:param no_annotations: flag triggering the exclusion of annotations from pdf catalog
:param no_chapters: flag triggering the exclusion of chapters (flat structure of elements)
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
Expand Down Expand Up @@ -138,7 +140,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
overall_pbar.update(1)

# extract annotations, name destinations and outline
extract_catalog(pdf)
extract_catalog(pdf, no_annotations)
overall_pbar.update(10)

# In figure_dict, figures are sorted by pages and y coordinates.
Expand Down
2 changes: 1 addition & 1 deletion libpdf/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def find_target_id(link: Link, pages_list: List[Page], src_element: Element) ->

text = str(src_element)
text_shortened = (text[:60] + '..') if len(text) > 60 else text
LOG.warning(
LOG.debug(
'The link "%s" on page %s could not be resolved to a libpdf element; replacing it with the raw '
'target page coordinate %s',
text_shortened,
Expand Down
16 changes: 13 additions & 3 deletions libpdf/textbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def extract_paragraphs_chapters(
LOG.info('Excluding chapters extraction')
else:
if catalog['outline']:
LOG.info('Extracting chapters ...')
chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)

paragraph_list = []
Expand Down Expand Up @@ -165,7 +166,12 @@ def render_chapters( # pylint: disable=too-many-branches, too-many-locals
chapters_sorted_by_page[chapter['position']['page']] = []
chapters_sorted_by_page[chapter['position']['page']].append(chapter)

for page_number, chapters in chapters_sorted_by_page.items():
for page_number, chapters in tqdm(
chapters_sorted_by_page.items(),
desc='###### Extracting chapters',
unit='pages',
bar_format=bar_format_lvl2(),
):
if page_number - 1 in page_lt_textboxes_filtered:
lt_textboxes = page_lt_textboxes_filtered[page_number - 1]
for page in page_list:
Expand Down Expand Up @@ -477,7 +483,12 @@ def render_paragraphs( # pylint: disable=too-many-branches
paragraph_list = []
paragraph_id = 1

for page_index, lt_textboxes in page_lt_textboxes_filtered.items():
for page_index, lt_textboxes in tqdm(
page_lt_textboxes_filtered.items(),
desc='###### Extracting paragraphs',
unit='pages',
bar_format=bar_format_lvl2(),
):
# add lt_textbox to a list of paragraphs
for lt_textbox in lt_textboxes:
# get position of lt_textbox
Expand Down Expand Up @@ -848,7 +859,6 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
desc='###### Extracting layout',
unit='pages',
bar_format=bar_format_lvl2(),
leave=False,
),
):
if logging_needed(idx_page, len(pdf.pages)):
Expand Down
Loading