-
Notifications
You must be signed in to change notification settings - Fork 3
refactor(issue_folder): otimiza leitura de arquivos e melhora tratame… #134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -4,7 +4,6 @@ | |||||||||||
| from datetime import datetime | ||||||||||||
|
|
||||||||||||
| from scielo_classic_website.htmlbody.html_body import HTMLContent | ||||||||||||
| from scielo_classic_website.isisdb.isis_cmd import get_documents_by_issue_folder | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def try_to_fix_encoding(nome_original): | ||||||||||||
|
|
@@ -41,48 +40,43 @@ def _get_classic_website_rel_path(file_path): | |||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def fixed_glob(patterns, file_type, recursive): | ||||||||||||
| paths = set() | ||||||||||||
| for pattern in patterns: | ||||||||||||
| for path in glob.glob(pattern, recursive=recursive): | ||||||||||||
| try: | ||||||||||||
| item = { | ||||||||||||
| "type": file_type, | ||||||||||||
| } | ||||||||||||
| item["path"] = path | ||||||||||||
| with open(path, "rb") as f: | ||||||||||||
| item["content"] = f.read() | ||||||||||||
| item["modified_date"] = modified_date(path) | ||||||||||||
| except Exception as e: | ||||||||||||
| logging.exception(e) | ||||||||||||
| item["error"] = str(e) | ||||||||||||
| item["error_type"] = type(e).__name__ | ||||||||||||
| yield item | ||||||||||||
| paths.update( | ||||||||||||
| glob.glob(pattern, recursive=recursive) | ||||||||||||
| ) | ||||||||||||
| return paths | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def get_files(patterns, file_type, recursive=False): | ||||||||||||
| for item in fixed_glob(patterns, file_type, recursive): | ||||||||||||
| for path in fixed_glob(patterns, file_type, recursive): | ||||||||||||
| try: | ||||||||||||
| if not item.get("error"): | ||||||||||||
| path = item["path"] | ||||||||||||
| item["name"] = os.path.basename(path) | ||||||||||||
| item["key"], item["extension"] = os.path.splitext(item["name"]) | ||||||||||||
| item["type"] = file_type | ||||||||||||
| item["relative_path"] = _get_classic_website_rel_path(path) | ||||||||||||
| yield item | ||||||||||||
|
|
||||||||||||
| item = { | ||||||||||||
| "type": file_type, | ||||||||||||
| "path": path, | ||||||||||||
| "modified_date": modified_date(path), | ||||||||||||
| "name": os.path.basename(path), | ||||||||||||
| "relative_path": _get_classic_website_rel_path(path), | ||||||||||||
| } | ||||||||||||
| item["key"], item["extension"] = os.path.splitext(item["name"]) | ||||||||||||
| with open(path, "rb") as f: | ||||||||||||
| item["content"] = f.read() | ||||||||||||
| except Exception as e: | ||||||||||||
| yield {"type": file_type, "error": str(e), "error_type": type(e).__name__} | ||||||||||||
| logging.exception(e) | ||||||||||||
| item["error"] = str(e) | ||||||||||||
| item["error_type"] = type(e).__name__ | ||||||||||||
|
Comment on lines 64 to +67
|
||||||||||||
| yield item | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def fix_html_content(content): | ||||||||||||
| if not content: | ||||||||||||
| return None | ||||||||||||
| try: | ||||||||||||
| content = content.decode("utf-8") | ||||||||||||
| logging.info("HTML content decoded as utf-8") | ||||||||||||
| except Exception as e: | ||||||||||||
| content = content.decode("iso-8859-1") | ||||||||||||
| logging.warning( | ||||||||||||
| f"HTML content decoded using iso-8859-1 due to utf-8 decoding error. {e}" | ||||||||||||
| ) | ||||||||||||
| logging.info("HTML content decoded as iso-8859-1") | ||||||||||||
|
Comment on lines +76 to +79
|
||||||||||||
| try: | ||||||||||||
| return HTMLContent(content).content | ||||||||||||
| except Exception as e: | ||||||||||||
|
|
@@ -104,13 +98,9 @@ def exceptions(self): | |||||||||||
|
|
||||||||||||
| @property | ||||||||||||
| def files(self): | ||||||||||||
| logging.info("xml") | ||||||||||||
| yield from self.bases_xml_files | ||||||||||||
| logging.info("bases_translation_files") | ||||||||||||
| yield from self.bases_translation_files | ||||||||||||
| logging.info("bases_pdf_files") | ||||||||||||
| yield from self.bases_pdf_files | ||||||||||||
| logging.info("htdocs_img_revistas_files") | ||||||||||||
| yield from self.htdocs_img_revistas_files | ||||||||||||
|
|
||||||||||||
| @property | ||||||||||||
|
|
@@ -136,9 +126,9 @@ def bases_translation_files(self): | |||||||||||
| pattern = os.path.join( | ||||||||||||
| self._classic_website_paths.bases_translation_path, | ||||||||||||
| self._subdir_acron_issue, | ||||||||||||
| "*.ht*", | ||||||||||||
| "**", | ||||||||||||
| "*.*", | ||||||||||||
|
||||||||||||
| "*.*", | |
| "*.ht*", |
Copilot
AI
Dec 22, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When `fix_html_content()` returns `None` (for empty content), the original `item["content"]` remains as raw bytes. This creates inconsistent content types in the output: some items will have processed string content while others have unprocessed bytes. This inconsistency could cause issues for consumers of this data. Consider always setting `item["content"]` to an appropriate value (e.g., an empty string or `None`) to maintain type consistency.
| if content: | |
| if content is None: | |
| # Ensure item["content"] is always a text value, not raw bytes | |
| item["content"] = "" | |
| else: |
Copilot
AI
Dec 22, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The removal of the `IndexError` handler without replacing it with proper bounds checking creates a potential bug. The code assumes `key[2]` exists when checking `key[2] == "_"` on line 184, but if `key` has fewer than 3 characters, this will raise an `IndexError` that is no longer caught. This will cause the entire generator to fail instead of gracefully handling the invalid filename.
| if "_" in key and key[2] == "_": | |
| if len(key) >= 3 and key[2] == "_": |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The parameter `file_type` is passed to `fixed_glob()` but never used within that function. This parameter should either be removed from the function signature or utilized if it serves a purpose.