Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 170 additions & 48 deletions scielo_classic_website/spsxml/sps_xml_body_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
from copy import deepcopy
from io import StringIO
import re

import plumber
from lxml import etree as ET
Expand All @@ -19,20 +20,50 @@
"e": "disp-formula",
}


ELEM_NAME = {
LABEL_INITIAL_TO_ELEMENT = {
"t": "table-wrap",
"f": "fig",
"e": "disp-formula",
"c": "table-wrap",
"c": "table-wrap", # cuadro
"a": "app", # appendix, anexo
}

FILENAME_TO_ELEMENT = {}
FILENAME_TO_ELEMENT.update(LABEL_INITIAL_TO_ELEMENT)
FILENAME_TO_ELEMENT["i"] = "fig"


ELEM_AND_REF_TYPE = {
"table-wrap": "table",
}


def get_letter_and_number(codigo):
"""
Verifica se a string inteira corresponde exatamente ao padrão:
[Letra (maiúscula/minúscula)][Um ou mais dígitos].
Se corresponder (ex: 'f1', 'A99'), retorna a string original.
Se não corresponder (ex: '1f', 'f1a'), retorna None.
"""

# Expressão Regular: r"^[a-zA-Z]\d+$"
# ^: Início da string
# [a-zA-Z]: Exatamente uma letra
# \d+: Um ou mais dígitos
# $: Fim da string
regex = r"^[a-zA-Z]\d+$"

# re.fullmatch() verifica se a string inteira corresponde ao padrão
match = re.fullmatch(regex, codigo)

if match:
# Se o padrão casar com a string inteira, retorna o valor original
return codigo
else:
# Caso contrário, retorna None
Comment on lines +49 to +63
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Inline comments are in Portuguese (lines 49-53, 56, 60, 63) while the docstring is also in Portuguese. Consider whether this is consistent with the project's documentation standards. If the project uses English for code comments, these should be translated for consistency.

Suggested change
# Expressão Regular: r"^[a-zA-Z]\d+$"
# ^: Início da string
# [a-zA-Z]: Exatamente uma letra
# \d+: Um ou mais dígitos
# $: Fim da string
regex = r"^[a-zA-Z]\d+$"
# re.fullmatch() verifica se a string inteira corresponde ao padrão
match = re.fullmatch(regex, codigo)
if match:
# Se o padrão casar com a string inteira, retorna o valor original
return codigo
else:
# Caso contrário, retorna None
# Regular Expression: r"^[a-zA-Z]\d+$"
# ^: Start of string
# [a-zA-Z]: Exactly one letter
# \d+: One or more digits
# $: End of string
regex = r"^[a-zA-Z]\d+$"
# re.fullmatch() checks if the entire string matches the pattern
match = re.fullmatch(regex, codigo)
if match:
# If the pattern matches the entire string, return the original value
return codigo
else:
# Otherwise, return None

Copilot uses AI. Check for mistakes.
return None


class XMLBodyAnBackConvertException(Exception): ...


Expand Down Expand Up @@ -257,6 +288,7 @@ def convert_html_to_xml_step_4(document):
# logging.info("convert_html_to_xml - step 4")
ppl = plumber.Pipeline(
StartPipe(),
ReplaceIdhrefAndRidhrefByIdPipe(),
DivIdToAssetPipe(),
XRefTypePipe(),
InsertGraphicInFigPipe(),
Expand Down Expand Up @@ -831,7 +863,7 @@ def parser_node(self, node, journal_acron):
if "img/revistas/" in href or ".." in href:
return self._create_internal_link_to_asset_html_page(node)

if journal_acron and journal_acron in href:
if journal_acron and f"/{journal_acron}/" in href.lower():
return self._create_internal_link_to_asset_html_page(node)

if ":" in href:
Expand Down Expand Up @@ -950,7 +982,32 @@ def transform(self, data):
def _extract_xref_text(self, xref_element):
return " ".join(xref_element.xpath(".//text()")).strip()

def _extract_rid(self, href, pkg_name, label_text, label_number):
def get_rid_from_xref_label_and_number(self, label_text, label_number):
"""
Gera o rid a partir do label_text e label_number.

Args:
label_text: Texto do label (e.g., 'Table', 'Figure')
label_number: Número do label (e.g., '1', '2')

Returns:
String com o rid ou None
"""
if not label_text:
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential IndexError if label_text is an empty string. Before accessing label_text[0] on line 998, you should verify that label_text has at least one character. The condition on line 995 only checks if it's not None/empty, but an empty string would pass that check and cause an IndexError.

Suggested change
if not label_text:
if not label_text or label_text == "":

Copilot uses AI. Check for mistakes.
return None

element_prefix = label_text[0].lower()
if not label_number:
return element_prefix

if label_number.isdigit():
return f"{element_prefix}{label_number}"

if label_number[:-1].isdigit() and label_number[-1].isalpha():
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential IndexError if label_number is an empty string. Before accessing label_number[:-1] and label_number[-1], you should check that label_number has at least one character. An empty string would cause an IndexError when accessing these indices.

Suggested change
if label_number[:-1].isdigit() and label_number[-1].isalpha():
if len(label_number) > 1 and label_number[:-1].isdigit() and label_number[-1].isalpha():

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Double space found between and and label_number. This should be a single space for consistency.

Suggested change
if label_number[:-1].isdigit() and label_number[-1].isalpha():
if label_number[:-1].isdigit() and label_number[-1].isalpha():

Copilot uses AI. Check for mistakes.
return f"{element_prefix}{label_number[:-1]}"
return None

def get_rid_from_href_and_pkg_name(self, href, pkg_name):
"""
Extrai o rid a partir do href e nome do pacote.

Expand All @@ -961,45 +1018,48 @@ def _extract_rid(self, href, pkg_name, label_text, label_number):
Returns:
String com o rid ou None
"""
if label_text and label_number and label_number.isdigit():
try:
return ELEM_NAME.get(label_text[0].lower())[0] + str(label_number)
except (IndexError, AttributeError, ValueError):
pass

basename = os.path.basename(href)
filename, _ = os.path.splitext(basename)
if filename.startswith(pkg_name):
rid = filename.replace(pkg_name, "")
if rid:
return rid

greater_pos = -1
rid = None
for k, v in ELEM_NAME.items():
position = filename.rfind(k)
if position > greater_pos:
rid = k
greater_pos = position
if rid:
rid = filename[greater_pos:]
return rid
return filename

def parse_xref_text(self, xref_text, label_text):
# Tables 1-3
parts = xref_text.split(" ")

if len(parts) == 1 and parts[0][0].isdigit():
return label_text, parts[0]

if len(parts) == 1:
return label_text, None
filename = filename.replace(pkg_name, "")
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using str.replace() on line 1023 could lead to incorrect results if pkg_name appears multiple times in the filename. For example, if pkg_name="test" and filename="testtest", this would result in an empty string. Consider using filename.removeprefix(pkg_name) (Python 3.9+) or filename[len(pkg_name):] if you specifically want to remove a prefix.

Suggested change
filename = filename.replace(pkg_name, "")
filename = filename[len(pkg_name):]

Copilot uses AI. Check for mistakes.
if not filename:
return None
return get_letter_and_number(filename)

if len(parts) == 2 and parts[-1][0].isdigit():
return parts[0], parts[-1]
def _extract_filename(self, href):
basename = os.path.basename(href)
filename, ext = os.path.splitext(basename)
return filename, ext

return None, None
def get_label_text_and_number_from_xref_text(self, xref_text, label_text):
# Tables 1-3
if not xref_text:
return None, None

parts = xref_text.split()

Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential IndexError when xref_text.split() returns an empty list. If xref_text contains only whitespace, split() will return an empty list, and accessing parts[-1] on line 1041 will raise an IndexError. Consider adding a check: if not parts: return None, None after line 1038.

Suggested change
if not parts:
return None, None

Copilot uses AI. Check for mistakes.
# first character of last part
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential IndexError when expected_number is an empty string. If for some reason parts[-1] is an empty string, accessing expected_number[0] on line 1042 will raise an IndexError. Consider adding a check: if not expected_number: return parts[0] if parts else None, None before line 1042.

Suggested change
# first character of last part
# first character of last part
if not parts or not parts[-1]:
return parts[0] if parts else None, None

Copilot uses AI. Check for mistakes.
expected_number = parts[-1]
if expected_number[0].isdigit():
if len(parts) == 2:
return parts[0], expected_number
if len(parts) == 1:
return label_text, expected_number
return parts[0], None

def get_element_name(self, label_text, rid, ext):
element_name = None
if label_text:
label_initial = label_text[0].lower()
element_name = LABEL_INITIAL_TO_ELEMENT.get(label_initial)
elif rid:
element_name = FILENAME_TO_ELEMENT.get(rid[0])
Comment on lines +1052 to +1056
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential IndexError if label_text or rid is an empty string. Before accessing label_text[0] on line 1052 or rid[0] on line 1055, you should verify these strings have at least one character. Empty strings would cause an IndexError.

Copilot uses AI. Check for mistakes.
if not element_name:
if ext in (".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".html", ".htm"):
element_name = "supplementary-material"
if not element_name:
element_name = "element"
return element_name

def parser_xref_parent(self, xref_parent, root, pkg_name):
label_text = None
Expand All @@ -1020,19 +1080,37 @@ def parser_xref_parent(self, xref_parent, root, pkg_name):
logging.error("XRefSpecialInternalLinkPipe - no href found")
continue

label_text, label_number = self.parse_xref_text(xref_text, label_text)
basename, ext = self._extract_filename(href)
child.set("filebasename", basename)

rid = self._extract_rid(href, pkg_name, label_text, label_number)
child.set("rid", rid)
element_name = ELEM_NAME.get(rid[0]) or "fig"
label_text, label_number = self.get_label_text_and_number_from_xref_text(xref_text, label_text)
rid = self.get_rid_from_xref_label_and_number(label_text, label_number)
if not rid:
rid = self.get_rid_from_href_and_pkg_name(href, pkg_name)
if rid:
child.set("rid-href", rid)

element_name = self.get_element_name(label_text, rid, ext)
child.set("ref-type", ELEM_AND_REF_TYPE.get(element_name) or element_name)
try:
found = root.xpath(f"//*[@id='{rid}']")[0]
xpath = f"//*[@filebasename='{basename}']"
if rid:
xpath = f"//*[@id='{rid}' | @filebasename='{basename}']"
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The XPath union operator | on line 1096 will match elements with either @id='{rid}' OR @filebasename='{basename}'. However, if an element has a different id but the same filebasename, it will still be matched. This could lead to matching the wrong element. Consider using and logic instead: //*[@id='{rid}' and @filebasename='{basename}'] or prioritizing the id match with a fallback query.

Suggested change
xpath = f"//*[@id='{rid}' | @filebasename='{basename}']"
xpath = f"//*[@id='{rid}' and @filebasename='{basename}']"

Copilot uses AI. Check for mistakes.
found = root.xpath(xpath)[0]
Comment on lines +1096 to +1099
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

XPath injection vulnerability: The basename variable is inserted directly into the XPath expression without sanitization. If basename contains special characters like single quotes, it could break the XPath query or potentially be exploited. Consider sanitizing basename or using parameterized queries if available. For example, if basename contains a single quote, the XPath query will fail.

Suggested change
xpath = f"//*[@filebasename='{basename}']"
if rid:
xpath = f"//*[@id='{rid}' | @filebasename='{basename}']"
found = root.xpath(xpath)[0]
if rid:
xpath = "//*[@id=$rid or @filebasename=$basename]"
found_nodes = root.xpath(xpath, rid=rid, basename=basename)
else:
xpath = "//*[@filebasename=$basename]"
found_nodes = root.xpath(xpath, basename=basename)
found = found_nodes[0]

Copilot uses AI. Check for mistakes.
if not found.get("filebasename"):
found.set("filebasename", basename)
if not found.get("id") and not found.get("id-href") and rid:
found.set("id-href", rid)

except IndexError:
new_elem = ET.Element(element_name)
new_elem.set("id", rid)
if rid:
new_elem.set("id-href", rid)
new_elem.set("filebasename", basename)

elem_label = ET.Element("label")
new_elem.append(elem_label)
elem_label.text = xref_text

g = ET.Element("graphic")
g.set("{http://www.w3.org/1999/xlink}href", href)
Expand All @@ -1042,7 +1120,9 @@ def parser_xref_parent(self, xref_parent, root, pkg_name):

child.attrib.pop("is_internal_link_to_asset_html_page")

for child in reversed(children):
# Sort children by rid before inserting
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment states "Sort children by rid before inserting" but the code actually sorts by filebasename. Either update the comment to reflect the actual sorting key or verify that sorting by filebasename is the intended behavior.

Suggested change
# Sort children by rid before inserting
# Sort children by filebasename before inserting

Copilot uses AI. Check for mistakes.
children_sorted = sorted(children, key=lambda x: x.get("filebasename"))
for child in children_sorted:
node = ET.Element(xref_parent.tag)
node.append(child)
xref_parent.addnext(node)
Expand Down Expand Up @@ -1157,14 +1237,19 @@ def parser_node(self, node):
graphic = sibling.find(".//graphic")
if graphic is None and table is None:
return

elem = None
if graphic is not None:
node.append(deepcopy(graphic))
elem = graphic
elif table is not None:
node.append(deepcopy(table))
elem = table
parent = elem.getparent()
parent.remove(elem)

if elem is not None:
parent = elem.getparent()
if parent is not None:
parent.remove(elem)

def transform(self, data):
raw, xml = data
Expand Down Expand Up @@ -1341,6 +1426,43 @@ def transform(self, data):
return data


class ReplaceIdhrefAndRidhrefByIdPipe(plumber.Pipe):
"""
Transforma div em table-wrap ou fig.
"""
def replace_rid_href_by_id(self, node, xml):
node_id = node.get("id")
filebasename = node.get("filebasename")
for xref in xml.xpath(f".//*[@rid and @filebasename='{filebasename}']"):
xref.set("rid", node_id)
xref.attrib.pop("rid-href", None)
xref.attrib.pop("filebasename", None)
node.attrib.pop("filebasename", None)
node.attrib.pop("id-href", None)

def create_rid_from_filebasename(self, node, rid=None):
node.set("rid", node.attrib.pop("filebasename"))
node.attrib.pop("rid-href", None)

def create_id_from_filebasename(self, node, xml):
node.set("id", node.attrib.pop("filebasename"))
node.attrib.pop("rid-href", None)

def transform(self, data):
raw, xml = data
for node in xml.xpath(".//*[@filebasename and @id]"):
self.replace_rid_href_by_id(node, xml)

for node in xml.xpath(".//*[not(@rid) and @rid-href and @filebasename]"):
self.create_rid_from_filebasename(node, xml)

for node in xml.xpath(".//*[not(@id) and @id-href and @filebasename]"):
self.create_id_from_filebasename(node, xml)


return data


class InsertCaptionAndTitleInTableWrapPipe(plumber.Pipe):
"""
Insere caption dentro de table-wrap.
Expand Down