Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 47 additions & 17 deletions jupyter_book_to_htmlbook/file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .reference_processing import (
process_interal_refs,
process_ids,
process_citations,
add_glossary_datatypes
)
from .code_processing import process_code
Expand Down Expand Up @@ -115,7 +116,7 @@ def apply_datatype(chapter, ch_name):
chapter['data-type'] = ch_stub.lower() # type: ignore
else:
chapter['data-type'] = "afterword" # type: ignore
elif ch_stub.lower()[:4] == "appx":
elif ch_stub.lower()[:4] == "appx" or ch_stub == "bibliography":
chapter['data-type'] = "appendix"
elif ch_stub.lower() == "glossary":
chapter['data-type'] = "glossary"
Expand All @@ -126,6 +127,25 @@ def apply_datatype(chapter, ch_name):
return chapter


def get_main_section(soup):
    """
    Return the main chapter <section> of a parsed chapter document.

    Gets the main "section," i.e., the first <section> element (the main
    chapter text), and additionally checks for a separate bibliography
    section, returning that as well (if it exists) so it can be dealt
    with later.

    Parameters:
        soup: a BeautifulSoup document for a single chapter file.

    Returns:
        (main, bibliography) tuple of Tags; bibliography is None when the
        document has no separate bibliography section. Returns (None, None)
        when the document contains no <section> at all (malformed file).
    """
    sections = soup.find_all('section')
    try:
        main = sections[0]
    except IndexError:
        # No top-level <section>: the chapter file is malformed.
        # NOTE: the caller's filename isn't in scope here, so the message
        # can't name the offending file.
        logging.warning("Chapter file is malformed: no <section> found.")
        return None, None
    # Only look for a bibliography when there is more than one section;
    # a lone section is the chapter body itself.
    bibliography = (
        soup.find('section', id="bibliography") if len(sections) > 1 else None
    )
    return main, bibliography


def process_chapter_soup(toc_element: Union[Path, list[Path]]):
""" unified file chapter processing """

Expand All @@ -142,34 +162,43 @@ def process_chapter_soup(toc_element: Union[Path, list[Path]]):
base_soup = BeautifulSoup(f, 'lxml')

# perform initial swapping and namespace designation
try:
chapter = base_soup.find_all('section')[0]
chapter, bib = get_main_section(base_soup)

if not chapter:
return None, None

else:
chapter['xmlns'] = 'http://www.w3.org/1999/xhtml' # type: ignore
del chapter['class']

except IndexError: # does not have a section class for top-level
logging.warning("Looks like {toc_element.name} is malformed.")
return None, None
# promote subheadings within "base" chapter
chapter = promote_headings(chapter)

# promote subheadings within "base" chapter
chapter = promote_headings(chapter)
if chapter_parts:
for subfile in chapter_parts:
subsection, sub_bib = process_chapter_subparts(subfile)
chapter.append(subsection)
if bib and sub_bib:
entries = sub_bib.find_all("dd")
bib.dl.extend(entries)
elif sub_bib:
bib = sub_bib

if chapter_parts:
for subfile in chapter_parts:
subsection = process_chapter_subparts(subfile)
chapter.append(subsection)
# apply appropriate data-type (best guess)
chapter = apply_datatype(chapter, ch_name)

# apply appropriate data-type (best guess)
chapter = apply_datatype(chapter, ch_name)
# add bibliography, if present
if bib:
chapter.append(bib)

return chapter, ch_name
return chapter, ch_name


def process_chapter_subparts(subfile):
""" processing for chapters with "sections" """
with open(subfile, 'r') as f:
soup = BeautifulSoup(f, 'lxml')
section = soup.find_all('section')[0]
section, bib = get_main_section(soup)
section['data-type'] = 'sect1' # type: ignore
del section['class'] # type: ignore
# move id from empty span to section
Expand All @@ -181,7 +210,7 @@ def process_chapter_subparts(subfile):
except KeyError:
# fun fact, this happens when there is numbering on the toc
pass # like before, if it's not there that's OK.
return section
return section, bib


def process_chapter(toc_element,
Expand All @@ -208,6 +237,7 @@ def process_chapter(toc_element,
chapter = process_figures(chapter, build_dir)
chapter = process_informal_figs(chapter, build_dir)
chapter = process_interal_refs(chapter)
chapter = process_citations(chapter)
chapter = process_footnotes(chapter)
chapter = process_admonitions(chapter)
chapter = process_math(chapter)
Expand Down
27 changes: 23 additions & 4 deletions jupyter_book_to_htmlbook/reference_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@ def process_interal_refs(chapter):
references to valid htmlbook xrefs. Currently opinionated towards CMS
author-date.
"""
xrefs = chapter.find_all(class_='internal')
xrefs = chapter.find_all("a", class_='internal')
for ref in xrefs:
# handle bib references, be opinionated!
if ref['href'].find('references.html') > -1:
# handle bib references
if (
ref.parent.name == "span" and
ref.parent.contents[0] == "[" and
ref.parent.contents[-1] == "]"
):
ref.name = 'span'
del ref['href']
# remove any internal tags
inner_str = ''
for part in ref.contents:
inner_str += part.string
# remove last comma per CMS
# remove last comma (before year/date) per CMS
inner_str = ','.join(inner_str.split(',')[0:-1]) + \
inner_str.split(',')[-1]
ref.string = f'({inner_str})'
Expand Down Expand Up @@ -111,3 +115,18 @@ def add_glossary_datatypes(chapter):
for defn in defs:
defn["data-type"] = "glossdef"
return chapter


def process_citations(chapter):
    """
    Process and handle bibliographical citations in a chapter.

    Each <dl class="citation"> list is converted into a
    <ul class="author-date">: the <dt> term elements are removed from the
    tree entirely, and every <dd> entry is renamed to <li>.

    Parameters:
        chapter: a BeautifulSoup Tag for the chapter.

    Returns:
        The same chapter Tag, modified in place.
    """
    for citation_list in chapter.find_all("dl", class_="citation"):
        citation_list.name = "ul"
        citation_list["class"] = "author-date"
        # Drop the term elements; only the entries survive as list items.
        for term in citation_list.find_all("dt"):
            term.decompose()
        for entry in citation_list.find_all("dd"):
            entry.name = "li"
    return chapter
Loading