-
Notifications
You must be signed in to change notification settings - Fork 3
Corrige e melhora conversão HTML para XML #137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
fb14ee0
8915f2d
56dca6f
58e26a4
e14ab07
20e897a
4e7756f
74a8e43
9ebea57
c415f81
93ed605
6aed010
293d58e
4937bb9
2905ea5
d4c46a2
d98b880
e8ebe5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| lxml | ||
| lxml==6.0.2 | ||
| picles.plumber==0.11 | ||
| black==23.3.0 |
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,16 +1,11 @@ | ||||||||||||||||
| import logging | ||||||||||||||||
| import os | ||||||||||||||||
| import sys | ||||||||||||||||
| from difflib import unified_diff | ||||||||||||||||
| from functools import cached_property | ||||||||||||||||
| from datetime import datetime | ||||||||||||||||
|
|
||||||||||||||||
| from lxml import etree | ||||||||||||||||
| from lxml.etree import ParseError, register_namespace | ||||||||||||||||
| from lxml.html import fromstring, html_to_xhtml, iterlinks, rewrite_links, tostring | ||||||||||||||||
|
|
||||||||||||||||
| from scielo_classic_website.htmlbody import html_fixer | ||||||||||||||||
| from scielo_classic_website.htmlbody.html_code_utils import html_safe_decode | ||||||||||||||||
| from scielo_classic_website.htmlbody.name2number import fix_pre_loading | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| class UnableToGetHTMLTreeError(Exception): ... | ||||||||||||||||
|
|
@@ -25,11 +20,28 @@ class HTMLContent: | |||||||||||||||
| """ | ||||||||||||||||
|
|
||||||||||||||||
| def __init__(self, content): | ||||||||||||||||
| # for prefix, uri in HTML_WORD_NAMESPACES.items(): | ||||||||||||||||
| # register_namespace(prefix, uri) | ||||||||||||||||
| self.original = content | ||||||||||||||||
| self.fixed_or_original = html_fixer.get_html_fixed_or_original(content) | ||||||||||||||||
| self.tree = self.fixed_or_original | ||||||||||||||||
| self.best_choice = None | ||||||||||||||||
| self.score = 0 | ||||||||||||||||
| self.fixed_html = None | ||||||||||||||||
| self._tree = None | ||||||||||||||||
|
|
||||||||||||||||
| try: | ||||||||||||||||
| self.fixed_html = html_fixer.get_fixed_html(content) | ||||||||||||||||
| self.score = html_fixer.get_fixed_similarity_rate( | ||||||||||||||||
| self.original, self.fixed_html | ||||||||||||||||
| ) | ||||||||||||||||
| self.best_choice = html_fixer.get_best_choice_between_original_and_fixed( | ||||||||||||||||
| self.score, self.original, self.fixed_html | ||||||||||||||||
| ) | ||||||||||||||||
| if self.best_choice == "original": | ||||||||||||||||
| self.tree = self.original | ||||||||||||||||
| else: | ||||||||||||||||
| self.tree = self.fixed_html | ||||||||||||||||
| except Exception as e: | ||||||||||||||||
| logging.exception(e) | ||||||||||||||||
| logging.info((self.score, self.best_choice)) | ||||||||||||||||
| self.tree = self.original | ||||||||||||||||
|
|
||||||||||||||||
| @staticmethod | ||||||||||||||||
| def create(file_path): | ||||||||||||||||
|
|
@@ -44,11 +56,14 @@ def create(file_path): | |||||||||||||||
| @property | ||||||||||||||||
| def content(self): | ||||||||||||||||
| if self.tree is None: | ||||||||||||||||
| logging.info("returning original content") | ||||||||||||||||
| return self.original | ||||||||||||||||
| try: | ||||||||||||||||
| self.fix_asset_paths() | ||||||||||||||||
| return html_fixer.html2xml(self.tree) | ||||||||||||||||
| except Exception as e: | ||||||||||||||||
| logging.exception(e) | ||||||||||||||||
| logging.info(f"returning original content due to exception: {e}") | ||||||||||||||||
| return self.original | ||||||||||||||||
|
|
||||||||||||||||
| @property | ||||||||||||||||
|
|
@@ -165,29 +180,25 @@ def get_references_block(self): | |||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| def get_paragraphs_text(p_records): | ||||||||||||||||
| if not p_records: | ||||||||||||||||
| return "" | ||||||||||||||||
| texts = [] | ||||||||||||||||
| for item in p_records: | ||||||||||||||||
| if not item.paragraph_text: | ||||||||||||||||
| continue | ||||||||||||||||
| texts.append(item.paragraph_text) | ||||||||||||||||
| return "".join(texts) | ||||||||||||||||
| yield item.paragraph_text | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| def get_text_block(paragraphs): | ||||||||||||||||
| if not paragraphs: | ||||||||||||||||
| return "" | ||||||||||||||||
| try: | ||||||||||||||||
| # corrige o bloco de parágrafos de uma vez | ||||||||||||||||
| paragraphs_text = get_paragraphs_text(paragraphs) | ||||||||||||||||
| hc = HTMLContent(paragraphs_text) | ||||||||||||||||
| return hc.content | ||||||||||||||||
| except Exception as e: | ||||||||||||||||
| # corrige cada parágrafo individualmente | ||||||||||||||||
| return get_text(get_paragraphs_data(paragraphs)) | ||||||||||||||||
| # corrige o bloco de parágrafos de uma vez | ||||||||||||||||
| return "".join(get_paragraphs_text(paragraphs)) | ||||||||||||||||
|
|
||||||||||||||||
| # hc = HTMLContent(paragraphs_text) | ||||||||||||||||
| # return hc.content | ||||||||||||||||
| # except Exception as e: | ||||||||||||||||
| # # corrige cada parágrafo individualmente | ||||||||||||||||
| # return get_text(get_paragraphs_data(paragraphs)) | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
|
Comment on lines
+195
to
+201
|
||||||||||||||||
| # hc = HTMLContent(paragraphs_text) | |
| # return hc.content | |
| # except Exception as e: | |
| # # corrige cada parágrafo individualmente | |
| # return get_text(get_paragraphs_data(paragraphs)) | |
Copilot
AI
Jan 9, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The function `fix_references` changed its logic significantly. Previously it tracked reference indices and could infer them; now it simply increments a counter starting from 0. The new logic passes the text through `html_to_node`, which can fail, but there is no error handling: if `html_to_node` raises an exception, the entire `fix_references` generator fails. Consider wrapping the `html_to_node` call in a try/except.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,6 @@ | ||
| import logging | ||
| from datetime import datetime | ||
| from difflib import SequenceMatcher | ||
|
|
||
|
|
||
| from lxml.html import fromstring, tostring | ||
|
|
||
|
|
||
|
|
@@ -20,34 +18,23 @@ | |
| DEFAULT_TAGS_TO_FIX = ("p", ) | ||
|
|
||
|
|
||
| def get_html_fixed_or_original(original): | ||
| fixed_html = get_fixed_html(original) | ||
|
|
||
| def get_fixed_similarity_rate(original, fixed_html): | ||
| """Verifica se o HTML corrigido é válido comparando com o original.""" | ||
| tagless_html = remove_tags(fixed_html) | ||
| tagless_text = get_fixed_text(original) | ||
|
|
||
| std_converted = tagless_html.split() | ||
| std_original = tagless_text.split() | ||
|
|
||
| score = SequenceMatcher(None, std_original, std_converted).ratio() | ||
| if score == 1: | ||
| return fixed_html | ||
| return SequenceMatcher(None, std_original, std_converted).ratio() | ||
|
|
||
| # now = datetime.now().isoformat() | ||
|
|
||
| # rows = [ | ||
| # original, | ||
| # tagless_html, | ||
| # tagless_text, | ||
| # ] | ||
|
|
||
| # with open(f"_diff_{now}.txt", "w") as fp: | ||
| # fp.write("\n--------\n".join(rows)) | ||
|
|
||
| if score > 0.7: | ||
| return fixed_html | ||
|
|
||
| return original | ||
| def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_score=0.7): | ||
| if score == 1: | ||
| return "fixed_html" | ||
| if score > min_score: | ||
| return "fixed_html" | ||
| return "original" | ||
|
|
||
|
|
||
| def load_html(content): | ||
|
|
@@ -69,7 +56,6 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp | |
| """ | ||
| style_mappings = style_mappings or DEFAULT_STYLE_MAPPINGS | ||
| tags_to_fix = tags_to_fix or DEFAULT_TAGS_TO_FIX | ||
|
|
||
| fixed_content = fix(content, style_mappings, tags_to_fix) | ||
| wrapped = wrap_html(fixed_content) | ||
| tree = fromstring(wrapped) | ||
|
|
@@ -130,12 +116,12 @@ def avoid_mismatched_styles(content, style_mappings=None): | |
|
|
||
| for tag, style in style_mappings.items(): | ||
| # Tags minúsculas | ||
| content = content.replace(f"<{tag}>", f'<span name="style_{style}">') | ||
| content = content.replace(f"<{tag}>", f'<span style="{style}">') | ||
| content = content.replace(f"</{tag}>", '</span>') | ||
|
|
||
| # Tags maiúsculas | ||
| tag_upper = tag.upper() | ||
| content = content.replace(f"<{tag_upper}>", f'<span name="style_{style}">') | ||
| content = content.replace(f"<{tag_upper}>", f'<span style="{style}">') | ||
| content = content.replace(f"</{tag_upper}>", '</span>') | ||
|
|
||
| return content | ||
|
|
@@ -219,7 +205,7 @@ def get_no_namespaces(content): | |
| yield item | ||
|
|
||
|
|
||
| def remove_tags(content): | ||
| def remove_tags(content, skip_tags=None): | ||
| """ | ||
| Remove todas as tags HTML do conteúdo. | ||
|
|
||
|
|
@@ -229,10 +215,10 @@ def remove_tags(content): | |
| Returns: | ||
| Conteúdo sem tags | ||
| """ | ||
| return "".join(get_tagless_items(content)) | ||
| return "".join(get_tagless_items(content, skip_tags=skip_tags)) | ||
|
|
||
|
|
||
| def get_tagless_items(content): | ||
| def get_tagless_items(content, skip_tags=None): | ||
| """ | ||
| Gerador que retorna apenas o conteúdo sem tags HTML. | ||
|
|
||
|
|
@@ -242,10 +228,21 @@ def get_tagless_items(content): | |
| Yields: | ||
| Partes do conteúdo sem tags | ||
| """ | ||
| tags_to_skip = [] | ||
| for tag in (skip_tags or []): | ||
| tags_to_skip.append(f"<{tag}") | ||
| tags_to_skip.append(f"</{tag}>") | ||
|
|
||
| for item in break_content(content): | ||
| if (item or "").strip(): | ||
| if item and item[0] == "<" and item[-1] == ">": | ||
| continue | ||
| if not item: | ||
| yield item | ||
| continue | ||
| if item[0] == "<" and item[-1] == ">": | ||
| for tag in tags_to_skip: | ||
| if item.startswith(tag): | ||
| yield item | ||
| break | ||
| continue | ||
| yield item.replace("<", "<").replace(">", ">") | ||
|
|
||
|
|
||
|
|
@@ -300,13 +297,19 @@ def tag_has_namespace(tag): | |
| """ | ||
| if ":" not in tag: | ||
| return False | ||
| for part in tag.split(): | ||
| if ":" in part.split("=")[0]: | ||
|
|
||
| tag = tag.replace('="', '-ATTRVALUE-BEGIN') | ||
| tag = tag.replace('"', "END-ATTRVALUE-')") | ||
| items = tag.split("-ATTRVALUE-") | ||
| for item in items: | ||
| if item.startswith("BEGIN") and item.endswith("END"): | ||
| continue | ||
| if ":" in item: | ||
| return True | ||
| return False | ||
|
Comment on lines
297
to
309
|
||
|
|
||
|
|
||
| def html2xml(tree): | ||
| def html2xml(tree, extra=None): | ||
|
||
| """ | ||
| Converte a árvore HTML para string XML. | ||
|
|
||
|
|
@@ -318,9 +321,14 @@ def html2xml(tree): | |
| """ | ||
| body = tree.find(".//body") | ||
| try: | ||
| content = tostring(body, method="xml", encoding="utf-8").decode("utf-8") | ||
| content = tostring(body, method="html", encoding="utf-8").decode("utf-8") | ||
|
||
| x = content[:content.find(">")+1] | ||
| if not x.startswith("<body"): | ||
| raise ValueError(f"Tag <body> não encontrada corretamente. {x}") | ||
| content = content[content.find(">")+1:] | ||
| content = content[:content.rfind("</body>")] | ||
| if extra: | ||
| content = extra + content | ||
| except Exception as e: | ||
| logging.exception(e) | ||
| raise | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `HTMLContent` constructor now has broad exception handling: it catches every exception and falls back to the original content. This adds robustness, but apart from a log entry every error is swallowed. Consider whether some exceptions should be re-raised or handled differently, especially critical errors that indicate fundamental problems with the input.