Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 47 additions & 17 deletions jupyter_book_to_htmlbook/file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .reference_processing import (
process_interal_refs,
process_ids,
process_citations,
add_glossary_datatypes
)
from .code_processing import process_code
Expand Down Expand Up @@ -115,7 +116,7 @@ def apply_datatype(chapter, ch_name):
chapter['data-type'] = ch_stub.lower() # type: ignore
else:
chapter['data-type'] = "afterword" # type: ignore
elif ch_stub.lower()[:4] == "appx":
elif ch_stub.lower()[:4] == "appx" or ch_stub == "bibliography":
chapter['data-type'] = "appendix"
elif ch_stub.lower() == "glossary":
chapter['data-type'] = "glossary"
Expand All @@ -126,6 +127,25 @@ def apply_datatype(chapter, ch_name):
return chapter


def get_main_section(soup):
    """
    Return the main chapter <section> of a parsed chapter document.

    Gets the main "section," i.e., the first <section> element (the main
    chapter text), and additionally checks for a separate bibliography
    section, returning that as well (if it exists) so it can be dealt
    with later.

    Parameters:
        soup: a BeautifulSoup document for a single chapter file.

    Returns:
        (main, bibliography) tuple of Tags; bibliography is None when the
        document has no separate bibliography section. Returns (None, None)
        when the document contains no <section> at all (malformed file).
    """
    sections = soup.find_all('section')
    try:
        main = sections[0]
    except IndexError:
        # No top-level <section>: the chapter file is malformed.
        # NOTE: the caller's filename isn't in scope here, so the message
        # can't name the offending file.
        logging.warning("Chapter file is malformed: no <section> found.")
        return None, None
    # Only look for a bibliography when there is more than one section;
    # a lone section is the chapter body itself.
    bibliography = (
        soup.find('section', id="bibliography") if len(sections) > 1 else None
    )
    return main, bibliography


def process_chapter_soup(toc_element: Union[Path, list[Path]]):
""" unified file chapter processing """

Expand All @@ -142,34 +162,43 @@ def process_chapter_soup(toc_element: Union[Path, list[Path]]):
base_soup = BeautifulSoup(f, 'lxml')

# perform initial swapping and namespace designation
try:
chapter = base_soup.find_all('section')[0]
chapter, bib = get_main_section(base_soup)

if not chapter:
return None, None

else:
chapter['xmlns'] = 'http://www.w3.org/1999/xhtml' # type: ignore
del chapter['class']

except IndexError: # does not have a section class for top-level
logging.warning("Looks like {toc_element.name} is malformed.")
return None, None
# promote subheadings within "base" chapter
chapter = promote_headings(chapter)

# promote subheadings within "base" chapter
chapter = promote_headings(chapter)
if chapter_parts:
for subfile in chapter_parts:
subsection, sub_bib = process_chapter_subparts(subfile)
chapter.append(subsection)
if bib and sub_bib:
entries = sub_bib.find_all("dd")
bib.dl.extend(entries)
elif sub_bib:
bib = sub_bib

if chapter_parts:
for subfile in chapter_parts:
subsection = process_chapter_subparts(subfile)
chapter.append(subsection)
# apply appropriate data-type (best guess)
chapter = apply_datatype(chapter, ch_name)

# apply appropriate data-type (best guess)
chapter = apply_datatype(chapter, ch_name)
# add bibliography, if present
if bib:
chapter.append(bib)

return chapter, ch_name
return chapter, ch_name


def process_chapter_subparts(subfile):
""" processing for chapters with "sections" """
with open(subfile, 'r') as f:
soup = BeautifulSoup(f, 'lxml')
section = soup.find_all('section')[0]
section, bib = get_main_section(soup)
section['data-type'] = 'sect1' # type: ignore
del section['class'] # type: ignore
# move id from empty span to section
Expand All @@ -181,7 +210,7 @@ def process_chapter_subparts(subfile):
except KeyError:
# fun fact, this happens when there is numbering on the toc
pass # like before, if it's not there that's OK.
return section
return section, bib


def process_chapter(toc_element,
Expand All @@ -208,6 +237,7 @@ def process_chapter(toc_element,
chapter = process_figures(chapter, build_dir)
chapter = process_informal_figs(chapter, build_dir)
chapter = process_interal_refs(chapter)
chapter = process_citations(chapter)
chapter = process_footnotes(chapter)
chapter = process_admonitions(chapter)
chapter = process_math(chapter)
Expand Down
27 changes: 23 additions & 4 deletions jupyter_book_to_htmlbook/reference_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@ def process_interal_refs(chapter):
references to valid htmlbook xrefs. Currently opinionated towards CMS
author-date.
"""
xrefs = chapter.find_all(class_='internal')
xrefs = chapter.find_all("a", class_='internal')
for ref in xrefs:
# handle bib references, be opinionated!
if ref['href'].find('references.html') > -1:
# handle bib references
if (
ref.parent.name == "span" and
ref.parent.contents[0] == "[" and
ref.parent.contents[-1] == "]"
):
ref.name = 'span'
del ref['href']
# remove any internal tags
inner_str = ''
for part in ref.contents:
inner_str += part.string
# remove last comma per CMS
# remove last comma (before year/date) per CMS
inner_str = ','.join(inner_str.split(',')[0:-1]) + \
inner_str.split(',')[-1]
ref.string = f'({inner_str})'
Expand Down Expand Up @@ -111,3 +115,18 @@ def add_glossary_datatypes(chapter):
for defn in defs:
defn["data-type"] = "glossdef"
return chapter


def process_citations(chapter):
    """
    Process and handle bibliographical citations in a chapter.

    Each <dl class="citation"> list is converted into a
    <ul class="author-date">: the <dt> term elements are removed from the
    tree entirely, and every <dd> entry is renamed to <li>.

    Parameters:
        chapter: a BeautifulSoup Tag for the chapter.

    Returns:
        The same chapter Tag, modified in place.
    """
    for citation_list in chapter.find_all("dl", class_="citation"):
        citation_list.name = "ul"
        citation_list["class"] = "author-date"
        # Drop the term elements; only the entries survive as list items.
        for term in citation_list.find_all("dt"):
            term.decompose()
        for entry in citation_list.find_all("dd"):
            entry.name = "li"
    return chapter
Loading