Skip to content

Commit

Permalink
Merge pull request useblocks#1 from useblocks/fix_catalog_anno
Browse files Browse the repository at this point in the history
Fixed catalog annotation
  • Loading branch information
ubmarco authored Jan 15, 2021
2 parents b05a0fe + 0f99f55 commit d676ee1
Showing 1 changed file with 66 additions and 39 deletions.
105 changes: 66 additions & 39 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from libpdf.utils import decode_title, to_pdfplumber_bbox

from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -38,47 +39,50 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches

# check if the name tree exists in the catalog and extract it
name_tree = {}
named_destination = {}
pdf_catalog = pdf.doc.catalog
if 'Names' in pdf_catalog:
# PDF 1.2
if isinstance(pdf_catalog['Names'], PDFObjRef) and 'Dests' in pdf_catalog['Names'].resolve():
name_tree = pdf_catalog['Names'].resolve()['Dests'].resolve()
elif isinstance(pdf_catalog['Names'], dict) and 'Dests' in pdf_catalog['Names']:
name_tree = pdf_catalog['Names']['Dests'].resolve()
# check if name tree not empty
if name_tree:
# map page id to page number
page_id_num_map = {}
for page in pdf.pages:
page_id_num_map[page.page_number] = page.page_obj.pageid

# If the key "Kids" exists, the name destination catalog is nested in more than one hierarchy level.
# In this case, it is flattened by the recursive function resolve_name_obj() for further processing.
# name_obj_list always contains a flattened name destination catalog.

# resolve name objects
if 'Kids' in name_tree:
kids_hierarchy = []
kids_hierarchy.extend([kid.resolve() for kid in name_tree['Kids']])
name_obj_list = resolve_name_obj(kids_hierarchy)
else:
name_obj_list = [name_tree]

for index_dest, item_dest in enumerate(name_obj_list):
# In 'Names', even indices hold destination names, while odd indices hold the objects that can be
# resolved to a certain page in the PDF
for index_name in range(0, len(item_dest['Names']), 2):
named_destination[name_obj_list[index_dest]['Names'][index_name].decode('utf-8')] = name_obj_list[
index_dest
]['Names'][index_name + 1]
elif 'Dests' in pdf_catalog:
# PDF 1.1
if isinstance(pdf_catalog['Dests'], PDFObjRef):
named_destination = pdf_catalog['Dests'].resolve()
elif isinstance(pdf_catalog['Dests'], dict):
named_destination = pdf_catalog['Dests']
else:
LOG.info('Catalog extraction: name destination does not exist...')
return None

# check if name tree not empty
if not name_tree:
LOG.info('Catalog extraction: name destination exists but is empty...')
LOG.debug('Catalog extraction: name destinations do not exist')
return None

# map page id to page number
page_id_num_map = {}
for page in pdf.pages:
page_id_num_map[page.page_number] = page.page_obj.pageid

# If the key "Kids" exists, the name destination catalog is nested in more than one hierarchy level.
# In this case, it is flattened by the recursive function resolve_name_obj() for further processing.
# name_obj_list always contains a flattened name destination catalog.

# resolve name objects
if 'Kids' in name_tree:
kids_hierarchy = []
kids_hierarchy.extend([kid.resolve() for kid in name_tree['Kids']])
name_obj_list = resolve_name_obj(kids_hierarchy)
else:
name_obj_list = [name_tree]

named_destination = {}
for index_dest, item_dest in enumerate(name_obj_list):
# In 'Names', even indices hold destination names, while odd indices hold the objects that can be resolved to
# a certain page in the PDF
for index_name in range(0, len(item_dest['Names']), 2):
named_destination[name_obj_list[index_dest]['Names'][index_name].decode('utf-8')] = name_obj_list[
index_dest
]['Names'][index_name + 1]

for key_object in named_destination:
# only resolve when the value of named_destination is instance of PDFObjRef
if isinstance(named_destination[key_object], PDFObjRef):
Expand Down Expand Up @@ -205,7 +209,7 @@ def chapter_number_giver(chapters_in_outline: List[Dict], virt_hierarchical_leve
chapter_number_giver(chapters_in_outline[idx_chapter]['content'], f'{new_hierarchical_level}.1')


def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disable=too-many-branches
def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disable=too-many-branches, too-many-statements
"""
Resolve outline hierarchy from top level to furthest level recursively.
Expand Down Expand Up @@ -254,7 +258,12 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl
)
else:
# named destination
outline_dest = outline_dest_entry['D'].decode('utf-8')
if isinstance(outline_dest_entry['D'], PSLiteral):
# PDF 1.1 name object
outline_dest = outline_dest_entry['D'].name
else:
# PDF 1.2 byte string
outline_dest = outline_dest_entry['D'].decode('utf-8')
title_bytes = outline_obj['Title'].resolve() # title is a PDFObjRef
else:
# not go-to action, no destination in this document to jump to
Expand All @@ -276,7 +285,12 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl
raise RuntimeError(f"Page {outline_obj['Dest'][0]} is not an indirect reference to a page object")
else:
# named destination
outline_dest = outline_obj['Dest'].decode('utf-8')
if isinstance(outline_obj['Dest'], PSLiteral):
# PDF 1.1 name object
outline_dest = outline_obj['Dest'].name
else:
# PDF 1.2 byte string
outline_dest = outline_obj['Dest'].decode('utf-8')
title_bytes = outline_obj['Title']
else:
raise ValueError('No key A and Dest in outline.')
Expand Down Expand Up @@ -412,12 +426,18 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): #
f"Page {ann_resolved_entry['D'][0]} is not an indirect reference to a page object",
)
else:
# implicit destination, ann_resolved['A']['D'] is byte string
# Named destination
if isinstance(ann_resolved_entry['D'], PSLiteral):
# PDF 1.1 name object
des_name = ann_resolved_entry['D'].name
else:
# PDF 1.2 byte string
des_name = ann_resolved_entry['D'].decode('utf-8')
annotation_page_map[idx_page + 1]['annotation'].append(
{
'text': ann_text,
'rect': ann_resolved['Rect'],
'des_name': ann_resolved_entry['D'].decode('utf-8'),
'des_name': des_name,
},
)
else:
Expand All @@ -443,9 +463,16 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): #
else:
raise RuntimeError(f"Page {ann_resolved['Dest'][0]} is not an indirect reference to a page object")
else:
# implicit destination
# Named destination
if isinstance(ann_resolved['Dest'], PSLiteral):
# PDF 1.1 name object
des_name = ann_resolved['Dest'].name
else:
# PDF 1.2 byte string
des_name = ann_resolved['Dest'].decode('utf-8')

annotation_page_map[idx_page + 1]['annotation'].append(
{'text': ann_text, 'rect': ann_resolved['Rect'], 'des_name': ann_resolved['Dest'].decode('utf-8')},
{'text': ann_text, 'rect': ann_resolved['Rect'], 'des_name': des_name},
)
else:
raise Exception('Key "A" and "Dest" do not exist in annotations.')
Expand Down

0 comments on commit d676ee1

Please sign in to comment.