Merged
18 commits
fb14ee0
fix(article_meta): simplify the search for corresp elements in the XML
robertatakenaka Dec 22, 2025
8915f2d
refactor(refs): optimize citation processing and error handling
robertatakenaka Dec 22, 2025
56dca6f
feat(xml_pipes): improve sup-to-xref conversion and detection of re…
robertatakenaka Dec 22, 2025
58e26a4
refactor(body_pipes): restructure the HTML-to-XML conversion pipeline…
robertatakenaka Dec 22, 2025
e14ab07
Merge branch 'main' into corrige_conversao_html_para_xml
robertatakenaka Dec 22, 2025
20e897a
fix: pin the lxml version to ensure compatibility
robertatakenaka Jan 9, 2026
4e7756f
refactor: improve namespaced-tag detection and HTML fixing
robertatakenaka Jan 9, 2026
74a8e43
refactor: simplify HTML and reference processing
robertatakenaka Jan 9, 2026
9ebea57
fix: correct XPaths and add plural support to the patterns
robertatakenaka Jan 9, 2026
c415f81
fix: adjust element mapping and remove ambiguous patterns
robertatakenaka Jan 9, 2026
93ed605
feat: add get_license_text_by_language method for easier access
robertatakenaka Jan 9, 2026
6aed010
chore: remove unused Reference import
robertatakenaka Jan 9, 2026
293d58e
refactor: improve author-notes and permissions processing
robertatakenaka Jan 9, 2026
4937bb9
feat: add html_merger module to merge HTML content
robertatakenaka Jan 9, 2026
2905ea5
refactor: major refactoring of body and back processing
robertatakenaka Jan 9, 2026
d4c46a2
refactor: reorganize the pipeline and add XMLAckPipe
robertatakenaka Jan 9, 2026
d98b880
fix(sps_xml_refs): improve reference handling and add Pub…
robertatakenaka Jan 9, 2026
e8ebe5f
fix(sps_xml_body_pipes): improve validation and removal of references…
robertatakenaka Jan 9, 2026
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
lxml
lxml==6.0.2
picles.plumber==0.11
black==23.3.0
114 changes: 78 additions & 36 deletions scielo_classic_website/htmlbody/html_body.py
@@ -1,16 +1,11 @@
import logging
import os
import sys
from difflib import unified_diff
from functools import cached_property
from datetime import datetime

from lxml import etree
from lxml.etree import ParseError, register_namespace
from lxml.html import fromstring, html_to_xhtml, iterlinks, rewrite_links, tostring

from scielo_classic_website.htmlbody import html_fixer
from scielo_classic_website.htmlbody.html_code_utils import html_safe_decode
from scielo_classic_website.htmlbody.name2number import fix_pre_loading


class UnableToGetHTMLTreeError(Exception): ...
@@ -25,11 +20,28 @@ class HTMLContent:
"""

def __init__(self, content):
# for prefix, uri in HTML_WORD_NAMESPACES.items():
# register_namespace(prefix, uri)
self.original = content
self.fixed_or_original = html_fixer.get_html_fixed_or_original(content)
self.tree = self.fixed_or_original
self.best_choice = None
self.score = 0
self.fixed_html = None
self._tree = None

try:
self.fixed_html = html_fixer.get_fixed_html(content)
self.score = html_fixer.get_fixed_similarity_rate(
self.original, self.fixed_html
)
self.best_choice = html_fixer.get_best_choice_between_original_and_fixed(
self.score, self.original, self.fixed_html
)
if self.best_choice == "original":
self.tree = self.original
else:
self.tree = self.fixed_html
except Exception as e:
logging.exception(e)
logging.info((self.score, self.best_choice))
self.tree = self.original
Comment on lines +42 to +44
Copilot AI Jan 9, 2026

The HTMLContent class constructor now has complex exception handling that catches all exceptions and falls back to using the original content. While this provides robustness, it silently swallows all errors except for logging. Consider whether some exceptions should be re-raised or handled differently, especially critical errors that indicate fundamental problems with the input.

Suggested change
logging.exception(e)
logging.info((self.score, self.best_choice))
self.tree = self.original
# Log the error that occurred while trying to build the tree,
# then attempt to fall back to the original content. If that also
# fails, raise a dedicated error to signal a critical failure.
logging.exception(e)
logging.info((self.score, self.best_choice))
try:
self.tree = self.original
except Exception as fallback_exc:
logging.exception(fallback_exc)
raise UnableToGetHTMLTreeError(
"Unable to build HTML tree from either fixed or original content"
) from fallback_exc


@staticmethod
def create(file_path):
@@ -44,11 +56,14 @@ def create(file_path):
@property
def content(self):
if self.tree is None:
logging.info("returning original content")
return self.original
try:
self.fix_asset_paths()
return html_fixer.html2xml(self.tree)
except Exception as e:
logging.exception(e)
logging.info(f"returning original content due to exception: {e}")
return self.original

@property
@@ -165,29 +180,25 @@ def get_references_block(self):


def get_paragraphs_text(p_records):
if not p_records:
return ""
texts = []
for item in p_records:
if not item.paragraph_text:
continue
texts.append(item.paragraph_text)
return "".join(texts)
yield item.paragraph_text


def get_text_block(paragraphs):
if not paragraphs:
return ""
try:
# fix the whole paragraph block at once
paragraphs_text = get_paragraphs_text(paragraphs)
hc = HTMLContent(paragraphs_text)
return hc.content
except Exception as e:
# fix each paragraph individually
return get_text(get_paragraphs_data(paragraphs))
# fix the whole paragraph block at once
return "".join(get_paragraphs_text(paragraphs))

# hc = HTMLContent(paragraphs_text)
# return hc.content
# except Exception as e:
# # fix each paragraph individually
# return get_text(get_paragraphs_data(paragraphs))


Comment on lines +195 to +201
Copilot AI Jan 9, 2026

The function get_text_block has been significantly simplified, removing the HTMLContent processing. The commented-out code on lines 195-199 suggests this was intentional, but there's no explanation for why this major change was made. If the HTML content processing is no longer needed, the commented code should be removed. If it might be needed again, add a comment explaining the reasoning.

Suggested change
# hc = HTMLContent(paragraphs_text)
# return hc.content
# except Exception as e:
# # corrige cada parágrafo individualmente
# return get_text(get_paragraphs_data(paragraphs))

def get_paragraphs_data(p_records, part_name=None):
index = None
for item in p_records:
@@ -236,18 +247,49 @@ def fix_paragraphs(p_records):


def fix_references(p_records):
index = None
index = 0
for item in p_records:
# item.data (dict with keys: text, index, reference_index)
if index:
index += 1
elif item.reference_index:
index = int(item.reference_index)
text = item.paragraph_text
if text:
data = {}
data.update(item.data)
data["text"] = html_fixer.avoid_mismatched_tags(text)
if not item.reference_index:
data["guessed_reference_index"] = str(index)
yield data
text = (item.paragraph_text or "").strip()
if not text:
continue
node = html_to_node("mixed-citation", text)
node_text = etree.tostring(node, encoding="utf-8").decode("utf-8")
fixed_text = node_text.replace("<mixed-citation>", "").replace("</mixed-citation>", "").strip()
if not fixed_text:
continue
index += 1
data = {}
data.update(item.data)
if not item.reference_index:
data["guessed_reference_index"] = str(index)
yield data
Comment on lines 249 to +266
Copilot AI Jan 9, 2026

The function fix_references changed its logic significantly. Previously it tracked reference indices and could infer them, but now it simply increments a counter starting from 0. The new logic processes text through html_to_node which could fail, but there's no error handling. If html_to_node raises an exception, the entire fix_references generator will fail. Consider adding try-except around the html_to_node call.

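A minimal sketch of the reviewer's suggestion, keeping the counting and text-cleaning flow from the diff above and assuming html_to_node, item.paragraph_text, item.data, and item.reference_index behave as shown there; a failure on one record is logged and skipped instead of aborting the whole generator:

    import logging

    from lxml import etree

    def fix_references(p_records):
        # Sketch only: same flow as the PR, plus per-record error isolation.
        index = 0
        for item in p_records:
            text = (item.paragraph_text or "").strip()
            if not text:
                continue
            try:
                node = html_to_node("mixed-citation", text)
                node_text = etree.tostring(node, encoding="utf-8").decode("utf-8")
            except Exception:
                # One bad record no longer kills the generator.
                logging.exception("html_to_node failed; skipping record")
                continue
            fixed_text = (
                node_text.replace("<mixed-citation>", "")
                .replace("</mixed-citation>", "")
                .strip()
            )
            if not fixed_text:
                continue
            index += 1
            data = dict(item.data)
            if not item.reference_index:
                data["guessed_reference_index"] = str(index)
            yield data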


def html_to_node(element_name, children_data_as_text):
if not element_name:
raise ValueError("element_name cannot be empty")
if not children_data_as_text:
return etree.Element(element_name)

# standardize entities
fixed_html_entities = fix_pre_loading(children_data_as_text)
try:
return get_node_from_standardized_html(element_name, fixed_html_entities)
except Exception as e:
logging.exception(f"Tentativa 2: Error: {e}")
xml = html_fixer.remove_tags(fixed_html_entities, ["a", "img"])
return get_node_from_standardized_html(element_name, xml)


def get_node_from_standardized_html(element_name, fixed_html_entities):
if element_name != "body":
fixed_html_entities = f"<{element_name}>{fixed_html_entities}</{element_name}>"
try:
hc = HTMLContent(fixed_html_entities)
node = hc.tree.find(f".//{element_name}")
if node is None:
raise ValueError(f"Unable to get node from html (node is None): {fixed_html_entities}")
return node
except Exception as e:
raise ValueError(f"Unable to get node from html (exception occurred: {e}): {fixed_html_entities}")
78 changes: 43 additions & 35 deletions scielo_classic_website/htmlbody/html_fixer.py
@@ -1,8 +1,6 @@
import logging
from datetime import datetime
from difflib import SequenceMatcher


from lxml.html import fromstring, tostring


@@ -20,34 +18,23 @@
DEFAULT_TAGS_TO_FIX = ("p", )


def get_html_fixed_or_original(original):
fixed_html = get_fixed_html(original)

def get_fixed_similarity_rate(original, fixed_html):
"""Verifica se o HTML corrigido é válido comparando com o original."""
tagless_html = remove_tags(fixed_html)
tagless_text = get_fixed_text(original)

std_converted = tagless_html.split()
std_original = tagless_text.split()

score = SequenceMatcher(None, std_original, std_converted).ratio()
if score == 1:
return fixed_html
return SequenceMatcher(None, std_original, std_converted).ratio()

# now = datetime.now().isoformat()

# rows = [
# original,
# tagless_html,
# tagless_text,
# ]

# with open(f"_diff_{now}.txt", "w") as fp:
# fp.write("\n--------\n".join(rows))

if score > 0.7:
return fixed_html

return original
def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_score=0.7):
if score == 1:
return "fixed_html"
if score > min_score:
return "fixed_html"
return "original"


def load_html(content):
@@ -69,7 +56,6 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp
"""
style_mappings = style_mappings or DEFAULT_STYLE_MAPPINGS
tags_to_fix = tags_to_fix or DEFAULT_TAGS_TO_FIX

fixed_content = fix(content, style_mappings, tags_to_fix)
wrapped = wrap_html(fixed_content)
tree = fromstring(wrapped)
@@ -130,12 +116,12 @@ def avoid_mismatched_styles(content, style_mappings=None):

for tag, style in style_mappings.items():
# Lowercase tags
content = content.replace(f"<{tag}>", f'<span name="style_{style}">')
content = content.replace(f"<{tag}>", f'<span style="{style}">')
content = content.replace(f"</{tag}>", '</span>')

# Uppercase tags
tag_upper = tag.upper()
content = content.replace(f"<{tag_upper}>", f'<span name="style_{style}">')
content = content.replace(f"<{tag_upper}>", f'<span style="{style}">')
content = content.replace(f"</{tag_upper}>", '</span>')

return content
@@ -219,7 +205,7 @@ def get_no_namespaces(content):
yield item


def remove_tags(content):
def remove_tags(content, skip_tags=None):
"""
Remove all HTML tags from the content.

@@ -229,10 +215,10 @@ def remove_tags(content):
Returns:
Content without tags
"""
return "".join(get_tagless_items(content))
return "".join(get_tagless_items(content, skip_tags=skip_tags))


def get_tagless_items(content):
def get_tagless_items(content, skip_tags=None):
"""
Generator that yields only the content without HTML tags.

@@ -242,10 +228,21 @@ def get_tagless_items(content):
Yields:
Parts of the content without tags
"""
tags_to_skip = []
for tag in (skip_tags or []):
tags_to_skip.append(f"<{tag}")
tags_to_skip.append(f"</{tag}>")

for item in break_content(content):
if (item or "").strip():
if item and item[0] == "<" and item[-1] == ">":
continue
if not item:
yield item
continue
if item[0] == "<" and item[-1] == ">":
for tag in tags_to_skip:
if item.startswith(tag):
yield item
break
continue
yield item.replace("<", "&lt;").replace(">", "&gt;")


@@ -300,13 +297,19 @@ def tag_has_namespace(tag):
"""
if ":" not in tag:
return False
for part in tag.split():
if ":" in part.split("=")[0]:

tag = tag.replace('="', '-ATTRVALUE-BEGIN')
tag = tag.replace('"', "END-ATTRVALUE-')")
items = tag.split("-ATTRVALUE-")
for item in items:
if item.startswith("BEGIN") and item.endswith("END"):
continue
if ":" in item:
return True
return False
Comment on lines 297 to 309
Copilot AI Jan 9, 2026

The function tag_has_namespace has been refactored with string replacements using unconventional delimiters like "-ATTRVALUE-BEGIN". While functional, this approach is fragile and hard to understand. Consider using a more robust parsing approach, such as regex or proper XML attribute parsing, to check for namespaces in tags.

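For illustration, a regex-based version of the more robust check the reviewer suggests; a sketch only, assuming tag holds the markup found between the angle brackets (as the current implementation treats it):

    import re

    # Quoted attribute values, so colons inside them (URLs, styles) are ignored.
    _ATTR_VALUE = re.compile(r'"[^"]*"|\'[^\']*\'')
    # A prefixed name such as o:p or xmlns:w at the start of the tag,
    # or after whitespace, <, or /.
    _PREFIXED_NAME = re.compile(r'(?:^|[<\s/])[A-Za-z_][\w.-]*:[A-Za-z_]')

    def tag_has_namespace(tag):
        # Strip attribute values first, then look for a prefixed name.
        stripped = _ATTR_VALUE.sub('""', tag)
        return bool(_PREFIXED_NAME.search(stripped))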


def html2xml(tree):
def html2xml(tree, extra=None):
Copilot AI Jan 9, 2026

The method signature for html2xml changed to add an extra parameter, but there's no documentation explaining what this parameter is for or when it should be used. Add a docstring parameter description for extra.

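A docstring sketch along the lines requested; the description of extra is inferred from how the parameter is used later in the function (it is prepended to the serialized body content), not from any upstream documentation:

    def html2xml(tree, extra=None):
        """
        Converts the HTML tree to an XML string.

        Args:
            tree: parsed HTML tree containing a <body> element
            extra: optional string prepended to the extracted <body>
                content before it is returned (inferred from usage below)

        Returns:
            The inner content of <body> as a string
        """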
"""
Converts the HTML tree to an XML string.

@@ -318,9 +321,14 @@ def html2xml(tree):
"""
body = tree.find(".//body")
try:
content = tostring(body, method="xml", encoding="utf-8").decode("utf-8")
content = tostring(body, method="html", encoding="utf-8").decode("utf-8")
Copilot AI Jan 9, 2026

The serialization method changed from "xml" to "html" on line 324, but the function is still called html2xml. This is confusing and could lead to misunderstandings about what the function does. Consider renaming the function or adding a comment explaining why HTML serialization is used instead of XML.

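If the comment route is taken, one hedged possibility; the PR does not state why method="xml" was replaced, so the rationale below is an assumption to be confirmed with the author:

    # NOTE: serialize with method="html" here; the <body> wrapper is
    # stripped below and the inner markup is returned as the XML string
    # callers expect. (Assumed rationale -- the PR does not say why
    # method="xml" was replaced; confirm with the author.)
    content = tostring(body, method="html", encoding="utf-8").decode("utf-8")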
x = content[:content.find(">")+1]
if not x.startswith("<body"):
raise ValueError(f"Tag <body> não encontrada corretamente. {x}")
content = content[content.find(">")+1:]
content = content[:content.rfind("</body>")]
if extra:
content = extra + content
except Exception as e:
logging.exception(e)
raise