Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions tools/add_metadata/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ Dry run:
python add_metadata.py --dry-run --endpoint="http://dev-5.pc:86/server/api/" --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense dc.date
```

## TUL fix date format in dc.date.issued

```
set ENVFILE=.env-tul
python add_metadata.py --fix-date-format --endpoint="https://dspace.tul.cz/server/api/" --dry-run
```

## TUL update dc.date.issued

```
Expand Down
177 changes: 132 additions & 45 deletions tools/add_metadata/add_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,38 +72,78 @@ def input(self) -> str:
def value(self) -> str:
return self._d

def is_valid(self):
"""Check if the given string is a valid date."""
@staticmethod
def is_year_only(date_str: str) -> bool:
"""Check if the string represents a year-only format (YYYY)."""
if len(date_str) != 4 or not date_str.isdigit():
return False
try:
datetime.strptime(self._d, '%Y-%m-%d')
datetime.strptime(date_str, '%Y')
return True
except ValueError as e:
date.invalid[self._d] += 1
if date.invalid[self._d] == 1:
_logger.debug(f"[{self._d}] is not valid date. Error: {e}")
except ValueError:
return False

def parse(self) -> bool:
"""Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'."""
def is_valid_hybrid(self):
"""Check if date is valid in YYYY-MM-DD, YYYY-MM, or YYYY format (all kept as-is)."""
# Check YYYY-MM-DD format
try:
datetime.strptime(self._d, '%Y-%m-%d')
return True
except ValueError:
pass

Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace on line. This should be removed to maintain code cleanliness.

Suggested change

Copilot uses AI. Check for mistakes.
# Check YYYY-MM format (partial date)
try:
datetime.strptime(self._d, '%Y-%m')
return True
except ValueError:
pass

Comment on lines +100 to +101
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace on lines. This should be removed to maintain code cleanliness.

Copilot uses AI. Check for mistakes.
# Check YYYY format (year only)
if date.is_year_only(self._d):
return True

Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace on line. This should be removed to maintain code cleanliness.

Suggested change

Copilot uses AI. Check for mistakes.
date.invalid[self._d] += 1
if date.invalid[self._d] == 1:
_logger.debug(f"[{self._d}] is not valid date format (expected YYYY-MM-DD, YYYY-MM, or YYYY)")
return False

def parse_hybrid(self) -> bool:
"""Convert date with hybrid rules:
- Keep YYYY format as-is (year only)
- Keep YYYY-MM format as-is (partial date, but normalize separators)
- Convert full dates to YYYY-MM-DD format
"""
if len(self._d) < 1:
return False

formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y',
'%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y', '%d. %m. %Y']
for fmt in formats:
# Check if it's already year-only format (YYYY) - keep as-is
if date.is_year_only(self._d):
return True

# Try full date formats (with day, month, and year)
full_date_formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y',
'%Y-%m-%d', '%d-%m-%Y', '%d. %m. %Y']
for fmt in full_date_formats:
try:
datetime_obj = datetime.strptime(self._d, fmt)
# Normalize date to 'YYYY-MM-DD'
if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', "%m/%Y", "%m.%Y"]:
self._d = datetime_obj.strftime('%Y-%m-01')
elif fmt == '%Y':
self._d = datetime_obj.strftime('%Y-01-01')
else:
self._d = datetime_obj.strftime('%Y-%m-%d')
# Normalize to 'YYYY-MM-DD'
self._d = datetime_obj.strftime('%Y-%m-%d')
return True
except ValueError:
continue

# Try partial date formats (year-month) - normalize to YYYY-MM
partial_formats = ['%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y']
for fmt in partial_formats:
try:
datetime_obj = datetime.strptime(self._d, fmt)
# Normalize to 'YYYY-MM' (keep as partial date)
self._d = datetime_obj.strftime('%Y-%m')
Comment on lines +136 to +142
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The partial date formats handling has inconsistent behavior. When parsing partial dates (YYYY-MM), the code converts them to YYYY-MM format (line 142), but the PR description states that partial dates should be "converted to year-only format" (e.g., 5/20252025). This contradicts the documented behavior.

Copilot uses AI. Check for mistakes.
return True
except ValueError:
# The test format does not match the input date format
continue

Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace on line. This should be removed to maintain code cleanliness.

Suggested change

Copilot uses AI. Check for mistakes.
_logger.warning(f"Error converting [{self._d}] to date.")
return False

Expand Down Expand Up @@ -136,9 +176,11 @@ def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run
self._dry_run = dry_run
self._info = {
"valid": [],
"valid_year_only": [],
"multiple": set(),
"invalid_date": [],
"invalid_date_all": set(),
"anomalies": [],
"updated": [],
"error_updating": [],
"error_creating": [],
Expand Down Expand Up @@ -169,36 +211,22 @@ def find_correct_metadata(self, item: dict):
# If there is more than one value, get only the first one
meta_val = date(meta_key[0]["value"])
# Convert date if necessary
if not meta_val.is_valid():
if not meta_val.parse():
if not meta_val.is_valid_hybrid():
if not meta_val.parse_hybrid():
self._info["invalid_date_all"].add(meta_val.input)
continue
return meta_val, id_str

return None, None

def update_existing_metadata(self, item: dict, date_str: str, force: bool = False) -> int:
uuid = item['uuid']
def _perform_update(self, item: dict, date_val: date, uuid: str, id_str: str) -> int:
"""Common logic for updating item metadata in database."""
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing docstring for the _perform_update method. Since this is a newly extracted helper method containing important update logic, it should have proper documentation explaining its parameters, return value, and behavior.

Suggested change
"""Common logic for updating item metadata in database."""
"""
Updates the metadata of an item in the database with a converted date value.
Parameters:
item (dict): The item dictionary containing metadata to be updated.
date_val (date): The date object representing the converted date value.
uuid (str): The unique identifier of the item.
id_str (str): A string used for logging, typically identifying the item and field.
Returns:
int: Status code indicating the result of the update operation:
updater.ret_failed (1): Update failed.
updater.ret_updated (2): Update succeeded.
Behavior:
- Logs the conversion of invalid dates.
- Updates the item's metadata with the converted date value.
- Attempts to update the item in the database (unless dry_run is True).
- Updates internal info structures to track update status.
"""

Copilot uses AI. Check for mistakes.
item_mtd = item["metadata"]

id_str = f"Item [{uuid}]: [{self._to_mtd_field}]"
# If there is more than one value, get only the first one
date_val = date(date_str)
if not force:
if date_val.is_valid():
self._info["valid"].append((uuid, date_val.input))
return updater.ret_already_ok

parsed_ok = date_val.parse()
if parsed_ok is False:
_logger.error(f"{id_str}: cannot convert [{date_val.input}] to date")
self._info["invalid_date"].append((uuid, date_val.input))
return updater.ret_invalid_meta

# Convert date to correct format if necessary

Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace on line. This should be removed to maintain code cleanliness.

Suggested change

Copilot uses AI. Check for mistakes.
# Log conversion
date.invalid_but_converted[date_val.input] += 1
if date.invalid_but_converted[date_val.input] == 1:
_logger.info(f"{id_str}: invalid date [{date_val.input}] converted")
_logger.info(f"{id_str}: invalid date [{date_val.input}] converted to [{date_val.value}]")

# Update the item metadata with the converted date
item_mtd[self._to_mtd_field][0]["value"] = date_val.value
Expand All @@ -214,6 +242,40 @@ def update_existing_metadata(self, item: dict, date_str: str, force: bool = Fals
self._info["updated"].append((uuid, date_val.input))
return updater.ret_updated

def update_existing_metadata(self, item: dict, date_str: str, force: bool = False) -> int:
"""Update existing metadata with hybrid rules:
- No null/empty handling (will crash on None)
- YYYY formats kept as-is
- Partial dates (YYYY-MM) kept as-is (normalized)
- Invalid dates logged as ANOMALY
"""
Comment on lines +246 to +251
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incomplete docstring. The docstring mentions "No null/empty handling (will crash on None)" but doesn't explain the force parameter, the return value, or other important aspects of the method's behavior.

Suggested change
"""Update existing metadata with hybrid rules:
- No null/empty handling (will crash on None)
- YYYY formats kept as-is
- Partial dates (YYYY-MM) kept as-is (normalized)
- Invalid dates logged as ANOMALY
"""
"""
Update existing metadata for an item using hybrid date rules.
Args:
item (dict): The item whose metadata is to be updated.
date_str (str): The date string to validate and update.
force (bool, optional): If True, forces the update even if the date is invalid or not in a preferred format.
If False, only updates if the date is not already valid or in a preferred format.
Behavior:
- No null/empty handling (will crash on None).
- Dates in YYYY format are kept as-is.
- Partial dates (YYYY-MM) are kept as-is (normalized).
- Invalid dates are logged as ANOMALY and not updated unless force=True.
- If force=True, attempts to update the metadata regardless of validity.
Returns:
int: Status code indicating the result:
- updater.ret_already_ok: Metadata was already valid and no update was performed.
- updater.ret_invalid_meta: The date was invalid and could not be updated.
- updater.ret_updated: Metadata was successfully updated.
- updater.ret_failed: An error occurred during the update.
"""

Copilot uses AI. Check for mistakes.
uuid = item['uuid']
id_str = f"Item [{uuid}]: [{self._to_mtd_field}]"
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace on line. This should be removed to maintain code cleanliness.

Copilot uses AI. Check for mistakes.

# No null/empty handling - let it crash if needed
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment "No null/empty handling - let it crash if needed" indicates intentional lack of defensive programming. However, this could lead to unclear error messages when None or empty values are encountered. Consider at least validating that date_str is not None/empty and providing a clear error message if it is, rather than allowing a generic crash.

Copilot uses AI. Check for mistakes.
date_val = date(date_str)
if not force:
if date_val.is_valid_hybrid():
# Check if it's year-only or partial format
if date.is_year_only(date_str):
self._info["valid_year_only"].append((uuid, date_val.input))
_logger.info(f"{id_str}: year-only format [{date_str}] - keeping as-is")
elif len(date_str) == 7 and date_str[4] == '-': # YYYY-MM format
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hardcoded check for YYYY-MM format is fragile. This checks for exactly 7 characters with a dash at position 4, which could match invalid dates like "abcd-ef". Consider using a regex pattern or the datetime parsing logic already available to validate the format more robustly.

Copilot uses AI. Check for mistakes.
self._info["valid"].append((uuid, date_val.input))
_logger.info(f"{id_str}: partial date format [{date_str}] - keeping as-is")
else:
self._info["valid"].append((uuid, date_val.input))
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent logging behavior. Year-only and partial date formats log info messages (lines 262, 265), but valid full dates in YYYY-MM-DD format do not (line 267). This creates inconsistent verbosity where some "valid" cases are logged and others are silent, making debugging and monitoring more difficult.

Suggested change
self._info["valid"].append((uuid, date_val.input))
self._info["valid"].append((uuid, date_val.input))
_logger.info(f"{id_str}: full date format [{date_str}] - keeping as-is")

Copilot uses AI. Check for mistakes.
return updater.ret_already_ok

parsed_ok = date_val.parse_hybrid()
if parsed_ok is False:
_logger.error(f"{id_str}: cannot convert [{date_val.input}] to date - ANOMALY")
self._info["invalid_date"].append((uuid, date_val.input))
self._info["anomalies"].append((uuid, date_val.input, "Cannot parse date format"))
return updater.ret_invalid_meta

return self._perform_update(item, date_val, uuid, id_str)

def add_new_metadata(self, item) -> int:
uuid = item['uuid']

Expand Down Expand Up @@ -250,7 +312,7 @@ def update(self, item: dict, force: bool = False) -> int:
for i in range(len(date_meta)):
if len(val) == 0:
date_val = date(date_meta[i]["value"])
if date_val.is_valid() or date_val.parse():
if date_val.is_valid_hybrid() or date_val.parse_hybrid():
val = date_val.value
continue
if val == '' and i == len(date_meta) - 1:
Expand All @@ -275,6 +337,7 @@ def update(self, item: dict, force: bool = False) -> int:
f"Forced metadata change but no value found for [{uuid}]")
return updater.ret_empty_meta

# Always use standard validation/parsing (no relaxed mode)
Copy link

Copilot AI Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment "Always use standard validation/parsing (no relaxed mode)" is unclear and adds no value. Either expand it to explain what it means and why it's important, or remove it.

Suggested change
# Always use standard validation/parsing (no relaxed mode)
# Use strict validation/parsing for metadata values to ensure data integrity.
# Relaxed parsing is intentionally avoided to prevent accepting malformed or ambiguous data.

Copilot uses AI. Check for mistakes.
return self.update_existing_metadata(item, val, force=force)
else:
return self.add_new_metadata(item)
Expand Down Expand Up @@ -312,17 +375,31 @@ def print_info(self, show_limit=100):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Add metadata for DSpace items")
parser.add_argument("--to_mtd_field",
type=str, required=True, help="Metadata field to be created.")
type=str, required=False, help="Metadata field to be created or updated (required unless --fix-date-format is used).")
parser.add_argument("--from_mtd_field",
type=str, nargs='+', required=True,
type=str, nargs='+', required=False,
help="Metadata field(s) from which value(s) can be used.")
parser.add_argument("--fix-date-format", action='store_true', default=False,
help="Fix date format in dc.date.issued field (no other parameters needed)")
parser.add_argument("--endpoint", type=str, default=env["backend"]["endpoint"])
parser.add_argument("--user", type=str, default=env["backend"]["user"])
parser.add_argument("--password", type=str, default=env["backend"]["password"])
parser.add_argument("--dry-run", action='store_true', default=False)
parser.add_argument("--result-every-N", type=int, default=10000)
parser.add_argument("--only", type=str, default=None)
args = parser.parse_args()

# Handle fix-date-format mode
if args.fix_date_format:
args.to_mtd_field = "dc.date.issued"
args.from_mtd_field = ["dc.date.issued"]
_logger.info("Fix date format mode enabled: correcting dc.date.issued")

# Validate required arguments for non-fix-date-format mode
if not args.fix_date_format:
if args.to_mtd_field is None or args.from_mtd_field is None:
parser.error("--to_mtd_field and --from_mtd_field are required unless --fix-date-format is used")

# output args from parse_args but without passwords
args_dict = vars(args).copy()
args_dict.pop("password", None)
Expand All @@ -341,7 +418,8 @@ def print_info(self, show_limit=100):
# Initialize DSpace backend
dspace_be = dspace.rest(endpoint, user, password, True)

upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run)
upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field,
dry_run=args.dry_run)

stats = additional_stats()

Expand Down Expand Up @@ -436,6 +514,15 @@ def print_info(self, show_limit=100):
for k, v in upd.info.items():
_logger.info(f"{k:20s}:{len(v):6d}: first {limit} items .. {list(v)[:limit]}...")

_logger.info(40 * "=")
_logger.info("Anomalies found:")
if len(upd.info["anomalies"]) > 0:
_logger.warning(f"Total anomalies: {len(upd.info['anomalies'])}")
for uuid, value, reason in upd.info["anomalies"][:100]: # Show first 100
_logger.warning(f" Item [{uuid}]: value=[{value}] - {reason}")
else:
_logger.info("No anomalies found")

_logger.info(40 * "=")
_logger.info("Date info")
msgs = "\n\t".join(upd.cannot_parse)
Expand Down
Loading