Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions scielo_classic_website/spsxml/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,15 @@ def analyze_xref(text: str = None, rid: str = None) -> Dict[str, Optional[str]]:
result["prefix"] = prefix
result["number"] = number
result["source"] = "text"
else:
ref_type_text, element_name_text, prefix, number = detect_from_text(text.split()[0])
if ref_type_text:
result["ref_type"] = ref_type_text
result["element_name"] = element_name_text
result["prefix"] = prefix
result["number"] = number
result["source"] = "text"
Comment on lines +196 to +202
Copy link

Copilot AI Jan 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If text is an empty string or contains only whitespace, text.split() returns an empty list, so text.split()[0] raises an IndexError. Check that the split result is non-empty before accessing its first element.

Suggested change
ref_type_text, element_name_text, prefix, number = detect_from_text(text.split()[0])
if ref_type_text:
result["ref_type"] = ref_type_text
result["element_name"] = element_name_text
result["prefix"] = prefix
result["number"] = number
result["source"] = "text"
parts = text.split()
if parts:
ref_type_text, element_name_text, prefix, number = detect_from_text(parts[0])
if ref_type_text:
result["ref_type"] = ref_type_text
result["element_name"] = element_name_text
result["prefix"] = prefix
result["number"] = number
result["source"] = "text"

Copilot uses AI. Check for mistakes.


Copy link

Copilot AI Jan 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is an extra blank line between the end of the else block and the comment for the next section. This should be removed to maintain consistent code formatting.

Suggested change

Copilot uses AI. Check for mistakes.
# Análise do ID
if rid:
Expand Down
9 changes: 7 additions & 2 deletions scielo_classic_website/spsxml/detector_config_xref.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@
# Africâner
r"(?i)^figs?\.?(?:\s*\d+)?$",
r"(?i)^figure?(?:\s*\d+)?$",
r"(?i)^beelde?(?:\s*\d+)?$"
r"(?i)^beelde?(?:\s*\d+)?$",
# Padrão Fig. No
r"(?i)^fig\.?\s*no\.?\s*\d+$"
],
# Tabelas
"table": [
Expand All @@ -107,7 +109,9 @@
r"(?i)^tafeln?(?:\s*\d+)?$",
# Africâner
r"(?i)^tabelle?(?:\s*\d+)?$",
r"(?i)^tabs?\.?(?:\s*\d+)?$"
r"(?i)^tabs?\.?(?:\s*\d+)?$",
# Padrão cuadro No
r"(?i)^cuadro\s*no\.?\s*\d+$"
],
# Referências bibliográficas
"bibr": [
Expand Down Expand Up @@ -548,6 +552,7 @@
r"^c\d+": "corresp",
r"^e\d+": "disp-formula",
r"^f\d+": "fig",
r"^cuadro\d+": "table", # cuadro em espanhol para table
r"^gf\d+": "graphic",
r"^suppl\d+": "supplementary-material",
r"^m\d+": "math",
Expand Down
10 changes: 7 additions & 3 deletions scielo_classic_website/spsxml/sps_xml_body_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
detect_from_text,
detect_element_type,
detect_sec_type,
detect_from_id,
)
from scielo_classic_website.spsxml.detector_title_parent import identify_parent_by_title
from scielo_classic_website.htmlbody.html_merger import (
Expand Down Expand Up @@ -128,7 +129,7 @@ def convert_html_to_xml(document):
convert_html_to_xml_step_60_ahref_and_aname,
convert_html_to_xml_step_70_complete_fig_and_tablewrap,
convert_html_to_xml_step_80_fix_sec,
Copy link

Copilot AI Jan 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The convert_html_to_xml_step_90_complete_disp_formula step is commented out without explanation. Add a comment explaining why this step is temporarily disabled and under what conditions it should be re-enabled.

Suggested change
convert_html_to_xml_step_80_fix_sec,
convert_html_to_xml_step_80_fix_sec,
# NOTE: Step 90 (`convert_html_to_xml_step_90_complete_disp_formula`) is
# temporarily disabled because it may incorrectly transform some <disp-formula>
# elements and generate invalid SPS XML. Re-enable this step only after the
# underlying issues are fixed and regression tests for complex formula markup
# are in place and passing.

Copilot uses AI. Check for mistakes.
convert_html_to_xml_step_90_complete_disp_formula,
# convert_html_to_xml_step_90_complete_disp_formula,
convert_html_to_xml_step_95_fix_body,
)
document.exceptions = []
Expand Down Expand Up @@ -1135,8 +1136,10 @@ def transform(self, data):
self.remove_top_and_back(xml)
self.remove_multiplicity(xml)
for node in xml.xpath(".//a[@name]"):
node.tag = "element"
node.set("id", node.attrib.pop("name"))
name = node.attrib.pop("name")
Copy link

Copilot AI Jan 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The refactored ANamePipe logic now uses detect_from_id to determine element types, but the existing test only covers the case where the name doesn't match any pattern (expecting <div> as fallback). Add tests for cases where the name matches known patterns like 'f1' (should become <fig>), 't1' (should become <table-wrap>), and 'cuadro1' (should become <table-wrap>).

Copilot uses AI. Check for mistakes.
ref_type, elem = detect_from_id(name)
node.tag = elem or "element"
node.set("id", name)
return data


Expand Down Expand Up @@ -1910,6 +1913,7 @@ def rename_center(self, root):
center.tag = "p"
continue
center.tag = "title"
Copy link

Copilot AI Jan 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The addition of ET.strip_tags(root, 'STRIPTAG') in the rename_center method is not explained. Add a comment explaining why STRIPTAG elements need to be stripped at this point in the processing pipeline.

Suggested change
center.tag = "title"
center.tag = "title"
# Remove os elementos marcados com STRIPTAG, usados apenas como marcadores
# temporários para <center> vazios, para que essas tags artificiais não
# apareçam no XML final.

Copilot uses AI. Check for mistakes.
ET.strip_tags(root, "STRIPTAG")


class XMLBoldToTitlePipe(plumber.Pipe):
Expand Down