Merged
18 commits
fb14ee0
fix(article_meta): simplify the search for corresp elements in the XML
robertatakenaka Dec 22, 2025
8915f2d
refactor(refs): optimize citation processing and error handling
robertatakenaka Dec 22, 2025
56dca6f
feat(xml_pipes): improve sup-to-xref conversion and detection of re…
robertatakenaka Dec 22, 2025
58e26a4
refactor(body_pipes): restructure the HTML-to-XML conversion pipeline…
robertatakenaka Dec 22, 2025
e14ab07
Merge branch 'main' into corrige_conversao_html_para_xml
robertatakenaka Dec 22, 2025
20e897a
fix: pin the lxml version to ensure compatibility
robertatakenaka Jan 9, 2026
4e7756f
refactor: improve namespaced-tag detection and HTML fixing
robertatakenaka Jan 9, 2026
74a8e43
refactor: simplify HTML and reference processing
robertatakenaka Jan 9, 2026
9ebea57
fix: correct XPaths and add plural support to the patterns
robertatakenaka Jan 9, 2026
c415f81
fix: adjust element mapping and remove ambiguous patterns
robertatakenaka Jan 9, 2026
93ed605
feat: add get_license_text_by_language method for easier access
robertatakenaka Jan 9, 2026
6aed010
chore: remove unused Reference import
robertatakenaka Jan 9, 2026
293d58e
refactor: improve author-notes and permissions processing
robertatakenaka Jan 9, 2026
4937bb9
feat: add html_merger module to merge HTML content
robertatakenaka Jan 9, 2026
2905ea5
refactor: major refactoring of body and back processing
robertatakenaka Jan 9, 2026
d4c46a2
refactor: reorganize the pipeline and add XMLAckPipe
robertatakenaka Jan 9, 2026
d98b880
fix(sps_xml_refs): improve reference handling and add Pub…
robertatakenaka Jan 9, 2026
e8ebe5f
fix(sps_xml_body_pipes): improve validation and removal of references…
robertatakenaka Jan 9, 2026
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
lxml
lxml==6.0.2
picles.plumber==0.11
black==23.3.0
114 changes: 78 additions & 36 deletions scielo_classic_website/htmlbody/html_body.py
@@ -1,16 +1,11 @@
import logging
import os
import sys
from difflib import unified_diff
from functools import cached_property
from datetime import datetime

from lxml import etree
from lxml.etree import ParseError, register_namespace
from lxml.html import fromstring, html_to_xhtml, iterlinks, rewrite_links, tostring

from scielo_classic_website.htmlbody import html_fixer
from scielo_classic_website.htmlbody.html_code_utils import html_safe_decode
from scielo_classic_website.htmlbody.name2number import fix_pre_loading


class UnableToGetHTMLTreeError(Exception): ...
@@ -25,11 +20,28 @@ class HTMLContent:
"""

def __init__(self, content):
# for prefix, uri in HTML_WORD_NAMESPACES.items():
# register_namespace(prefix, uri)
self.original = content
self.fixed_or_original = html_fixer.get_html_fixed_or_original(content)
self.tree = self.fixed_or_original
self.best_choice = None
self.score = 0
self.fixed_html = None
self._tree = None

try:
self.fixed_html = html_fixer.get_fixed_html(content)
self.score = html_fixer.get_fixed_similarity_rate(
self.original, self.fixed_html
)
self.best_choice = html_fixer.get_best_choice_between_original_and_fixed(
self.score, self.original, self.fixed_html
)
if self.best_choice == "original":
self.tree = self.original
else:
self.tree = self.fixed_html
except Exception as e:
logging.exception(e)
logging.info((self.score, self.best_choice))
self.tree = self.original
Comment on lines +42 to +44
Copilot AI Jan 9, 2026

The HTMLContent class constructor now has complex exception handling that catches all exceptions and falls back to using the original content. While this provides robustness, it silently swallows all errors except for logging. Consider whether some exceptions should be re-raised or handled differently, especially critical errors that indicate fundamental problems with the input.

Suggested change
logging.exception(e)
logging.info((self.score, self.best_choice))
self.tree = self.original
# Log the error that occurred while trying to build the tree,
# then attempt to fall back to the original content. If that also
# fails, raise a dedicated error to signal a critical failure.
logging.exception(e)
logging.info((self.score, self.best_choice))
try:
self.tree = self.original
except Exception as fallback_exc:
logging.exception(fallback_exc)
raise UnableToGetHTMLTreeError(
"Unable to build HTML tree from either fixed or original content"
) from fallback_exc


@staticmethod
def create(file_path):
@@ -44,11 +56,14 @@ def create(file_path):
@property
def content(self):
if self.tree is None:
logging.info("returning original content")
return self.original
try:
self.fix_asset_paths()
return html_fixer.html2xml(self.tree)
except Exception as e:
logging.exception(e)
logging.info(f"returning original content due to exception: {e}")
return self.original

@property
@@ -165,29 +180,25 @@ def get_references_block(self):


def get_paragraphs_text(p_records):
if not p_records:
return ""
texts = []
for item in p_records:
if not item.paragraph_text:
continue
texts.append(item.paragraph_text)
return "".join(texts)
yield item.paragraph_text


def get_text_block(paragraphs):
if not paragraphs:
return ""
try:
# fix the whole paragraph block at once
paragraphs_text = get_paragraphs_text(paragraphs)
hc = HTMLContent(paragraphs_text)
return hc.content
except Exception as e:
# fix each paragraph individually
return get_text(get_paragraphs_data(paragraphs))
# fix the whole paragraph block at once
return "".join(get_paragraphs_text(paragraphs))

# hc = HTMLContent(paragraphs_text)
# return hc.content
# except Exception as e:
# # fix each paragraph individually
# return get_text(get_paragraphs_data(paragraphs))


Comment on lines +195 to +201
Copilot AI Jan 9, 2026

The function get_text_block has been significantly simplified, removing the HTMLContent processing. The commented-out code on lines 195-199 suggests this was intentional, but there's no explanation for why this major change was made. If the HTML content processing is no longer needed, the commented code should be removed. If it might be needed again, add a comment explaining the reasoning.

Suggested change
# hc = HTMLContent(paragraphs_text)
# return hc.content
# except Exception as e:
# # corrige cada parágrafo individualmente
# return get_text(get_paragraphs_data(paragraphs))

def get_paragraphs_data(p_records, part_name=None):
index = None
for item in p_records:
@@ -236,18 +247,49 @@ def fix_paragraphs(p_records):


def fix_references(p_records):
index = None
index = 0
for item in p_records:
# item.data (dict with keys: text, index, reference_index)
if index:
index += 1
elif item.reference_index:
index = int(item.reference_index)
text = item.paragraph_text
if text:
data = {}
data.update(item.data)
data["text"] = html_fixer.avoid_mismatched_tags(text)
if not item.reference_index:
data["guessed_reference_index"] = str(index)
yield data
text = (item.paragraph_text or "").strip()
if not text:
continue
node = html_to_node("mixed-citation", text)
node_text = etree.tostring(node, encoding="utf-8").decode("utf-8")
fixed_text = node_text.replace("<mixed-citation>", "").replace("</mixed-citation>", "").strip()
if not fixed_text:
continue
index += 1
data = {}
data.update(item.data)
if not item.reference_index:
data["guessed_reference_index"] = str(index)
yield data
Comment on lines 249 to +266
Copilot AI Jan 9, 2026

The function fix_references changed its logic significantly. Previously it tracked reference indices and could infer them, but now it simply increments a counter starting from 0. The new logic processes text through html_to_node which could fail, but there's no error handling. If html_to_node raises an exception, the entire fix_references generator will fail. Consider adding try-except around the html_to_node call.

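A minimal sketch of the reviewer's suggestion, keeping the counting and text-cleaning flow from the diff above and assuming html_to_node, item.paragraph_text, item.data, and item.reference_index behave as shown there; a failure on one record is logged and skipped instead of aborting the whole generator:

    import logging

    from lxml import etree

    def fix_references(p_records):
        # Sketch only: same flow as the PR, plus per-record error isolation.
        index = 0
        for item in p_records:
            text = (item.paragraph_text or "").strip()
            if not text:
                continue
            try:
                node = html_to_node("mixed-citation", text)
                node_text = etree.tostring(node, encoding="utf-8").decode("utf-8")
            except Exception:
                # One bad record no longer kills the generator.
                logging.exception("html_to_node failed; skipping record")
                continue
            fixed_text = (
                node_text.replace("<mixed-citation>", "")
                .replace("</mixed-citation>", "")
                .strip()
            )
            if not fixed_text:
                continue
            index += 1
            data = dict(item.data)
            if not item.reference_index:
                data["guessed_reference_index"] = str(index)
            yield data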


def html_to_node(element_name, children_data_as_text):
if not element_name:
raise ValueError("element_name cannot be empty")
if not children_data_as_text:
return etree.Element(element_name)

# standardize entities
fixed_html_entities = fix_pre_loading(children_data_as_text)
try:
return get_node_from_standardized_html(element_name, fixed_html_entities)
except Exception as e:
logging.exception(f"Tentativa 2: Error: {e}")
xml = html_fixer.remove_tags(fixed_html_entities, ["a", "img"])
return get_node_from_standardized_html(element_name, xml)


def get_node_from_standardized_html(element_name, fixed_html_entities):
if element_name != "body":
fixed_html_entities = f"<{element_name}>{fixed_html_entities}</{element_name}>"
try:
hc = HTMLContent(fixed_html_entities)
node = hc.tree.find(f".//{element_name}")
if node is None:
raise ValueError(f"Unable to get node from html (node is None): {fixed_html_entities}")
return node
except Exception as e:
raise ValueError(f"Unable to get node from html (exception occurred: {e}): {fixed_html_entities}")
78 changes: 43 additions & 35 deletions scielo_classic_website/htmlbody/html_fixer.py
@@ -1,8 +1,6 @@
import logging
from datetime import datetime
from difflib import SequenceMatcher


from lxml.html import fromstring, tostring


@@ -20,34 +18,23 @@
DEFAULT_TAGS_TO_FIX = ("p", )


def get_html_fixed_or_original(original):
fixed_html = get_fixed_html(original)

def get_fixed_similarity_rate(original, fixed_html):
"""Verifica se o HTML corrigido é válido comparando com o original."""
tagless_html = remove_tags(fixed_html)
tagless_text = get_fixed_text(original)

std_converted = tagless_html.split()
std_original = tagless_text.split()

score = SequenceMatcher(None, std_original, std_converted).ratio()
if score == 1:
return fixed_html
return SequenceMatcher(None, std_original, std_converted).ratio()

# now = datetime.now().isoformat()

# rows = [
# original,
# tagless_html,
# tagless_text,
# ]

# with open(f"_diff_{now}.txt", "w") as fp:
# fp.write("\n--------\n".join(rows))

if score > 0.7:
return fixed_html

return original
def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_score=0.7):
if score == 1:
return "fixed_html"
if score > min_score:
return "fixed_html"
return "original"


def load_html(content):
@@ -69,7 +56,6 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp
"""
style_mappings = style_mappings or DEFAULT_STYLE_MAPPINGS
tags_to_fix = tags_to_fix or DEFAULT_TAGS_TO_FIX

fixed_content = fix(content, style_mappings, tags_to_fix)
wrapped = wrap_html(fixed_content)
tree = fromstring(wrapped)
@@ -130,12 +116,12 @@ def avoid_mismatched_styles(content, style_mappings=None):

for tag, style in style_mappings.items():
# Lowercase tags
content = content.replace(f"<{tag}>", f'<span name="style_{style}">')
content = content.replace(f"<{tag}>", f'<span style="{style}">')
content = content.replace(f"</{tag}>", '</span>')

# Uppercase tags
tag_upper = tag.upper()
content = content.replace(f"<{tag_upper}>", f'<span name="style_{style}">')
content = content.replace(f"<{tag_upper}>", f'<span style="{style}">')
content = content.replace(f"</{tag_upper}>", '</span>')

return content
@@ -219,7 +205,7 @@ def get_no_namespaces(content):
yield item


def remove_tags(content):
def remove_tags(content, skip_tags=None):
"""
Remove all HTML tags from the content.

@@ -229,10 +215,10 @@ def remove_tags(content):
Returns:
Content without tags
"""
return "".join(get_tagless_items(content))
return "".join(get_tagless_items(content, skip_tags=skip_tags))


def get_tagless_items(content):
def get_tagless_items(content, skip_tags=None):
"""
Generator that yields only the content without HTML tags.

@@ -242,10 +228,21 @@ def get_tagless_items(content):
Yields:
Parts of the content without tags
"""
tags_to_skip = []
for tag in (skip_tags or []):
tags_to_skip.append(f"<{tag}")
tags_to_skip.append(f"</{tag}>")

for item in break_content(content):
if (item or "").strip():
if item and item[0] == "<" and item[-1] == ">":
continue
if not item:
yield item
continue
if item[0] == "<" and item[-1] == ">":
for tag in tags_to_skip:
if item.startswith(tag):
yield item
break
continue
yield item.replace("<", "&lt;").replace(">", "&gt;")


@@ -300,13 +297,19 @@ def tag_has_namespace(tag):
"""
if ":" not in tag:
return False
for part in tag.split():
if ":" in part.split("=")[0]:

tag = tag.replace('="', '-ATTRVALUE-BEGIN')
tag = tag.replace('"', "END-ATTRVALUE-')")
items = tag.split("-ATTRVALUE-")
for item in items:
if item.startswith("BEGIN") and item.endswith("END"):
continue
if ":" in item:
return True
return False
Comment on lines 297 to 309
Copilot AI Jan 9, 2026

The function tag_has_namespace has been refactored with string replacements using unconventional delimiters like "-ATTRVALUE-BEGIN". While functional, this approach is fragile and hard to understand. Consider using a more robust parsing approach, such as regex or proper XML attribute parsing, to check for namespaces in tags.

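For illustration, a regex-based version of the more robust check the reviewer suggests; a sketch only, assuming tag holds the markup found between the angle brackets (as the current implementation treats it):

    import re

    # Quoted attribute values, so colons inside them (URLs, styles) are ignored.
    _ATTR_VALUE = re.compile(r'"[^"]*"|\'[^\']*\'')
    # A prefixed name such as o:p or xmlns:w at the start of the tag,
    # or after whitespace, <, or /.
    _PREFIXED_NAME = re.compile(r'(?:^|[<\s/])[A-Za-z_][\w.-]*:[A-Za-z_]')

    def tag_has_namespace(tag):
        # Strip attribute values first, then look for a prefixed name.
        stripped = _ATTR_VALUE.sub('""', tag)
        return bool(_PREFIXED_NAME.search(stripped))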


def html2xml(tree):
def html2xml(tree, extra=None):
Copilot AI Jan 9, 2026

The method signature for html2xml changed to add an extra parameter, but there's no documentation explaining what this parameter is for or when it should be used. Add a docstring parameter description for extra.

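A docstring sketch along the lines requested; the description of extra is inferred from how the parameter is used later in the function (it is prepended to the serialized body content), not from any upstream documentation:

    def html2xml(tree, extra=None):
        """
        Converts the HTML tree to an XML string.

        Args:
            tree: parsed HTML tree containing a <body> element
            extra: optional string prepended to the extracted <body>
                content before it is returned (inferred from usage below)

        Returns:
            The inner content of <body> as a string
        """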
"""
Converts the HTML tree to an XML string.

@@ -318,9 +321,14 @@ def html2xml(tree):
"""
body = tree.find(".//body")
try:
content = tostring(body, method="xml", encoding="utf-8").decode("utf-8")
content = tostring(body, method="html", encoding="utf-8").decode("utf-8")
Copilot AI Jan 9, 2026

The serialization method changed from "xml" to "html" on line 324, but the function is still called html2xml. This is confusing and could lead to misunderstandings about what the function does. Consider renaming the function or adding a comment explaining why HTML serialization is used instead of XML.

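If the comment route is taken, one hedged possibility; the PR does not state why method="xml" was replaced, so the rationale below is an assumption to be confirmed with the author:

    # NOTE: serialize with method="html" here; the <body> wrapper is
    # stripped below and the inner markup is returned as the XML string
    # callers expect. (Assumed rationale -- the PR does not say why
    # method="xml" was replaced; confirm with the author.)
    content = tostring(body, method="html", encoding="utf-8").decode("utf-8")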
x = content[:content.find(">")+1]
if not x.startswith("<body"):
raise ValueError(f"Tag <body> não encontrada corretamente. {x}")
content = content[content.find(">")+1:]
content = content[:content.rfind("</body>")]
if extra:
content = extra + content
except Exception as e:
logging.exception(e)
raise