Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 29 additions & 47 deletions scielo_classic_website/models/issue_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from datetime import datetime

from scielo_classic_website.htmlbody.html_body import HTMLContent
from scielo_classic_website.isisdb.isis_cmd import get_documents_by_issue_folder


def try_to_fix_encoding(nome_original):
Expand Down Expand Up @@ -41,48 +40,43 @@ def _get_classic_website_rel_path(file_path):


def fixed_glob(patterns, file_type, recursive):
paths = set()
for pattern in patterns:
for path in glob.glob(pattern, recursive=recursive):
try:
item = {
"type": file_type,
}
item["path"] = path
with open(path, "rb") as f:
item["content"] = f.read()
item["modified_date"] = modified_date(path)
except Exception as e:
logging.exception(e)
item["error"] = str(e)
item["error_type"] = type(e).__name__
yield item
paths.update(
glob.glob(pattern, recursive=recursive)
)
return paths
Comment on lines 42 to +48
Copy link

Copilot AI Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parameter file_type is passed to fixed_glob() but never used within that function. This parameter should either be removed from the function signature or utilized if it serves a purpose.

Copilot uses AI. Check for mistakes.


def get_files(patterns, file_type, recursive=False):
for item in fixed_glob(patterns, file_type, recursive):
for path in fixed_glob(patterns, file_type, recursive):
try:
if not item.get("error"):
path = item["path"]
item["name"] = os.path.basename(path)
item["key"], item["extension"] = os.path.splitext(item["name"])
item["type"] = file_type
item["relative_path"] = _get_classic_website_rel_path(path)
yield item

item = {
"type": file_type,
"path": path,
"modified_date": modified_date(path),
"name": os.path.basename(path),
"relative_path": _get_classic_website_rel_path(path),
}
item["key"], item["extension"] = os.path.splitext(item["name"])
with open(path, "rb") as f:
item["content"] = f.read()
except Exception as e:
yield {"type": file_type, "error": str(e), "error_type": type(e).__name__}
logging.exception(e)
item["error"] = str(e)
item["error_type"] = type(e).__name__
Comment on lines 64 to +67
Copy link

Copilot AI Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable item may not be defined if an exception occurs before line 54. If an exception happens during the modified_date(), os.path.basename(), or _get_classic_website_rel_path() calls (lines 57-59), the variable item will not exist when trying to set error attributes on lines 66-67, causing a NameError.

Copilot uses AI. Check for mistakes.
yield item


def fix_html_content(content):
if not content:
return None
try:
content = content.decode("utf-8")
logging.info("HTML content decoded as utf-8")
except Exception as e:
content = content.decode("iso-8859-1")
logging.warning(
f"HTML content decoded using iso-8859-1 due to utf-8 decoding error. {e}"
)
logging.info("HTML content decoded as iso-8859-1")
Comment on lines +76 to +79
Copy link

Copilot AI Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logging at INFO level for every file's encoding detection will create excessive log noise in production. Consider using DEBUG level for these routine operations, or removing the logs entirely since encoding detection is expected behavior.

Copilot uses AI. Check for mistakes.
try:
return HTMLContent(content).content
except Exception as e:
Expand All @@ -104,13 +98,9 @@ def exceptions(self):

@property
def files(self):
logging.info("xml")
yield from self.bases_xml_files
logging.info("bases_translation_files")
yield from self.bases_translation_files
logging.info("bases_pdf_files")
yield from self.bases_pdf_files
logging.info("htdocs_img_revistas_files")
yield from self.htdocs_img_revistas_files

@property
Expand All @@ -136,9 +126,9 @@ def bases_translation_files(self):
pattern = os.path.join(
self._classic_website_paths.bases_translation_path,
self._subdir_acron_issue,
"*.ht*",
"**",
"*.*",
Copy link

Copilot AI Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pattern change from ".ht" to "" + "." enables recursive searching, but will now match all file types in subdirectories, not just HTML files (.htm, .html). This could lead to processing non-HTML files as HTML content, which may cause errors. Consider using a more specific pattern like "/.ht" to maintain the original file type filtering while enabling recursive search.

Suggested change
"*.*",
"*.ht*",

Copilot uses AI. Check for mistakes.
)
logging.info(pattern)
for item in get_files([pattern], "html"):
if item.get("error"):
yield item
Expand All @@ -156,11 +146,8 @@ def bases_translation_files(self):
item["lang"] = lang
item["part"] = part
content = fix_html_content(item["content"])
if not content:
raise ValueError(
f"Conteúdo vazio no arquivo HTML {item['path']}"
)
item["content"] = content
if content:
Copy link

Copilot AI Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When fix_html_content() returns None (for empty content), the original item["content"] remains as raw bytes. This creates inconsistent content types in the output - some items will have processed string content while others have unprocessed bytes. This inconsistency could cause issues for consumers of this data. Consider always setting item["content"] to an appropriate value (e.g., empty string or None) to maintain type consistency.

Suggested change
if content:
if content is None:
# Ensure item["content"] is always a text value, not raw bytes
item["content"] = ""
else:

Copilot uses AI. Check for mistakes.
item["content"] = content
yield item

except Exception as e:
Expand Down Expand Up @@ -192,17 +179,12 @@ def bases_pdf_files(self):
yield item
continue
try:
lang = None
item["lang"] = None
key = item["key"]
if "_" in key and key[2] == "_":
Copy link

Copilot AI Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The removal of the IndexError handler without replacing it with proper bounds checking creates a potential bug. The code assumes key[2] exists when checking key[2] == "_" on line 184, but if key has fewer than 3 characters, this will raise an IndexError that is no longer caught. This will cause the entire generator to fail instead of gracefully handling the invalid filename.

Suggested change
if "_" in key and key[2] == "_":
if len(key) >= 3 and key[2] == "_":

Copilot uses AI. Check for mistakes.
# translations
lang = key[:2]
key = key[3:]
item["key"] = key
except IndexError as e:
continue
try:
item["lang"] = lang
item["key"] = key[3:]
item["lang"] = key[:2]
yield item
except Exception as e:
yield {"type": "pdf", "error": str(e), "error_type": type(e).__name__}
Expand Down