dataquest-dev · jr-rk · Nov 3, 2025 · Nov 5, 2025 · Nov 5, 2025 · Nov 5, 2025
diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md
@@ -9,6 +9,13 @@ Dry run:
 python add_metadata.py --dry-run --endpoint="http://dev-5.pc:86/server/api/" --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense dc.date
 ```
 
+## TUL fix date format in dc.date.issued
+
+```
+set ENVFILE=.env-tul
+python add_metadata.py --fix-date-format --endpoint="https://dspace.tul.cz/server/api/" --dry-run
+```
+
 ## TUL update dc.date.issued
 
 ```

diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py
@@ -72,38 +72,78 @@ def input(self) -> str:
     def value(self) -> str:
         return self._d
 
-    def is_valid(self):
-        """Check if the given string is a valid date."""
+    @staticmethod
+    def is_year_only(date_str: str) -> bool:
+        """Check if the string represents a year-only format (YYYY)."""
+        if len(date_str) != 4 or not date_str.isdigit():
+            return False
         try:
-            datetime.strptime(self._d, '%Y-%m-%d')
+            datetime.strptime(date_str, '%Y')
             return True
-        except ValueError as e:
-            date.invalid[self._d] += 1
-            if date.invalid[self._d] == 1:
-                _logger.debug(f"[{self._d}] is not valid date. Error: {e}")
+        except ValueError:
             return False
 
-    def parse(self) -> bool:
-        """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'."""
+    def is_valid_hybrid(self):
+        """Check if date is valid in YYYY-MM-DD, YYYY-MM, or YYYY format (all kept as-is)."""
+        # Check YYYY-MM-DD format
+        try:
+            datetime.strptime(self._d, '%Y-%m-%d')
+            return True
+        except ValueError:
+            pass
+
-        
-        
+        # Check YYYY-MM format (partial date)
+        try:
+            datetime.strptime(self._d, '%Y-%m')
+            return True
+        except ValueError:
+            pass
+
+        # Check YYYY format (year only)
+        if date.is_year_only(self._d):
+            return True
+
-        
-        
+        date.invalid[self._d] += 1
+        if date.invalid[self._d] == 1:
+            _logger.debug(f"[{self._d}] is not valid date format (expected YYYY-MM-DD, YYYY-MM, or YYYY)")
+        return False
+
+    def parse_hybrid(self) -> bool:
+        """Convert date with hybrid rules:
+        - Keep YYYY format as-is (year only)
+        - Keep YYYY-MM format as-is (partial date, but normalize separators)
+        - Convert full dates to YYYY-MM-DD format
+        """
         if len(self._d) < 1:
             return False
 
-        formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y',
-                   '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y', '%d. %m. %Y']
-        for fmt in formats:
+        # Check if it's already year-only format (YYYY) - keep as-is
+        if date.is_year_only(self._d):
+            return True
+
+        # Try full date formats (with day, month, and year)
+        full_date_formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', 
+                             '%Y-%m-%d', '%d-%m-%Y', '%d. %m. %Y']
+        for fmt in full_date_formats:
             try:
                 datetime_obj = datetime.strptime(self._d, fmt)
-                # Normalize date to 'YYYY-MM-DD'
-                if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', "%m/%Y", "%m.%Y"]:
-                    self._d = datetime_obj.strftime('%Y-%m-01')
-                elif fmt == '%Y':
-                    self._d = datetime_obj.strftime('%Y-01-01')
-                else:
-                    self._d = datetime_obj.strftime('%Y-%m-%d')
+                # Normalize to 'YYYY-MM-DD'
+                self._d = datetime_obj.strftime('%Y-%m-%d')
+                return True
+            except ValueError:
+                continue
+
+        # Try partial date formats (year-month) - normalize to YYYY-MM
+        partial_formats = ['%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y']
+        for fmt in partial_formats:
+            try:
+                datetime_obj = datetime.strptime(self._d, fmt)
+                # Normalize to 'YYYY-MM' (keep as partial date)
+                self._d = datetime_obj.strftime('%Y-%m')
                 return True
             except ValueError:
-                # The test format does not match the input date format
                 continue
+
-        
-        
         _logger.warning(f"Error converting [{self._d}] to date.")
         return False
 
@@ -136,9 +176,11 @@ def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run
         self._dry_run = dry_run
         self._info = {
             "valid": [],
+            "valid_year_only": [],
             "multiple": set(),
             "invalid_date": [],
             "invalid_date_all": set(),
+            "anomalies": [],
             "updated": [],
             "error_updating": [],
             "error_creating": [],
@@ -169,36 +211,22 @@ def find_correct_metadata(self, item: dict):
             # If there is more than one value, get only the first one
             meta_val = date(meta_key[0]["value"])
             # Convert date if necessary
-            if not meta_val.is_valid():
-                if not meta_val.parse():
+            if not meta_val.is_valid_hybrid():
+                if not meta_val.parse_hybrid():
                     self._info["invalid_date_all"].add(meta_val.input)
                     continue
             return meta_val, id_str
 
         return None, None
 
-    def update_existing_metadata(self, item: dict, date_str: str, force: bool = False) -> int:
-        uuid = item['uuid']
+    def _perform_update(self, item: dict, date_val: date, uuid: str, id_str: str) -> int:
+        """Common logic for updating item metadata in database."""
-        """Common logic for updating item metadata in database."""
+        """
+        Updates the metadata of an item in the database with a converted date value.
+
+        Parameters:
+            item (dict): The item dictionary containing metadata to be updated.
+            date_val (date): The date object representing the converted date value.
+            uuid (str): The unique identifier of the item.
+            id_str (str): A string used for logging, typically identifying the item and field.
+
+        Returns:
+            int: Status code indicating the result of the update operation:
+                updater.ret_failed (1): Update failed.
+                updater.ret_updated (2): Update succeeded.
+
+        Behavior:
+            - Logs the conversion of invalid dates.
+            - Updates the item's metadata with the converted date value.
+            - Attempts to update the item in the database (unless dry_run is True).
+            - Updates internal info structures to track update status.
+        """
-        """Common logic for updating item metadata in database."""
+        """
+        Updates the metadata of an item in the database with a converted date value.
+
+        Parameters:
+            item (dict): The item dictionary containing metadata to be updated.
+            date_val (date): The date object representing the converted date value.
+            uuid (str): The unique identifier of the item.
+            id_str (str): A string used for logging, typically identifying the item and field.
+
+        Returns:
+            int: Status code indicating the result of the update operation:
+                updater.ret_failed (1): Update failed.
+                updater.ret_updated (2): Update succeeded.
+
+        Behavior:
+            - Logs the conversion of invalid dates.
+            - Updates the item's metadata with the converted date value.
+            - Attempts to update the item in the database (unless dry_run is True).
+            - Updates internal info structures to track update status.
+        """
         item_mtd = item["metadata"]
-
-        id_str = f"Item [{uuid}]: [{self._to_mtd_field}]"
-        # If there is more than one value, get only the first one
-        date_val = date(date_str)
-        if not force:
-            if date_val.is_valid():
-                self._info["valid"].append((uuid, date_val.input))
-                return updater.ret_already_ok
-
-            parsed_ok = date_val.parse()
-            if parsed_ok is False:
-                _logger.error(f"{id_str}: cannot convert [{date_val.input}] to date")
-                self._info["invalid_date"].append((uuid, date_val.input))
-                return updater.ret_invalid_meta
-
-        # Convert date to correct format if necessary
+
-        
-        
+        # Log conversion
         date.invalid_but_converted[date_val.input] += 1
         if date.invalid_but_converted[date_val.input] == 1:
-            _logger.info(f"{id_str}: invalid date [{date_val.input}] converted")
+            _logger.info(f"{id_str}: invalid date [{date_val.input}] converted to [{date_val.value}]")
 
         # Update the item metadata with the converted date
         item_mtd[self._to_mtd_field][0]["value"] = date_val.value
@@ -214,6 +242,40 @@ def update_existing_metadata(self, item: dict, date_str: str, force: bool = Fals
         self._info["updated"].append((uuid, date_val.input))
         return updater.ret_updated
 
+    def update_existing_metadata(self, item: dict, date_str: str, force: bool = False) -> int:
+        """Update existing metadata with hybrid rules:
+        - No null/empty handling (will crash on None)
+        - YYYY formats kept as-is
+        - Partial dates (YYYY-MM) kept as-is (normalized)
+        - Invalid dates logged as ANOMALY
+        """
-        """Update existing metadata with hybrid rules:
-        - No null/empty handling (will crash on None)
-        - YYYY formats kept as-is
-        - Partial dates (YYYY-MM) kept as-is (normalized)
-        - Invalid dates logged as ANOMALY
-        """
+        """
+        Update existing metadata for an item using hybrid date rules.
+
+        Args:
+            item (dict): The item whose metadata is to be updated.
+            date_str (str): The date string to validate and update.
+            force (bool, optional): If True, forces the update even if the date is invalid or not in a preferred format.
+                If False, only updates if the date is not already valid or in a preferred format.
+
+        Behavior:
+            - No null/empty handling (will crash on None).
+            - Dates in YYYY format are kept as-is.
+            - Partial dates (YYYY-MM) are kept as-is (normalized).
+            - Invalid dates are logged as ANOMALY and not updated unless force=True.
+            - If force=True, attempts to update the metadata regardless of validity.
+
+        Returns:
+            int: Status code indicating the result:
+                - updater.ret_already_ok: Metadata was already valid and no update was performed.
+                - updater.ret_invalid_meta: The date was invalid and could not be updated.
+                - updater.ret_updated: Metadata was successfully updated.
+                - updater.ret_failed: An error occurred during the update.
+        """
-        """Update existing metadata with hybrid rules:
-        - No null/empty handling (will crash on None)
-        - YYYY formats kept as-is
-        - Partial dates (YYYY-MM) kept as-is (normalized)
-        - Invalid dates logged as ANOMALY
-        """
+        """
+        Update existing metadata for an item using hybrid date rules.
+
+        Args:
+            item (dict): The item whose metadata is to be updated.
+            date_str (str): The date string to validate and update.
+            force (bool, optional): If True, forces the update even if the date is invalid or not in a preferred format.
+                If False, only updates if the date is not already valid or in a preferred format.
+
+        Behavior:
+            - No null/empty handling (will crash on None).
+            - Dates in YYYY format are kept as-is.
+            - Partial dates (YYYY-MM) are kept as-is (normalized).
+            - Invalid dates are logged as ANOMALY and not updated unless force=True.
+            - If force=True, attempts to update the metadata regardless of validity.
+
+        Returns:
+            int: Status code indicating the result:
+                - updater.ret_already_ok: Metadata was already valid and no update was performed.
+                - updater.ret_invalid_meta: The date was invalid and could not be updated.
+                - updater.ret_updated: Metadata was successfully updated.
+                - updater.ret_failed: An error occurred during the update.
+        """
+        uuid = item['uuid']
+        id_str = f"Item [{uuid}]: [{self._to_mtd_field}]"
+
+        # No null/empty handling - let it crash if needed
+        date_val = date(date_str)
+        if not force:
+            if date_val.is_valid_hybrid():
+                # Check if it's year-only or partial format
+                if date.is_year_only(date_str):
+                    self._info["valid_year_only"].append((uuid, date_val.input))
+                    _logger.info(f"{id_str}: year-only format [{date_str}] - keeping as-is")
+                elif len(date_str) == 7 and date_str[4] == '-':  # YYYY-MM format
+                    self._info["valid"].append((uuid, date_val.input))
+                    _logger.info(f"{id_str}: partial date format [{date_str}] - keeping as-is")
+                else:
+                    self._info["valid"].append((uuid, date_val.input))
-                    self._info["valid"].append((uuid, date_val.input))
+                    self._info["valid"].append((uuid, date_val.input))
+                    _logger.info(f"{id_str}: full date format [{date_str}] - keeping as-is")
-                    self._info["valid"].append((uuid, date_val.input))
+                    self._info["valid"].append((uuid, date_val.input))
+                    _logger.info(f"{id_str}: full date format [{date_str}] - keeping as-is")
+                return updater.ret_already_ok
+
+            parsed_ok = date_val.parse_hybrid()
+            if parsed_ok is False:
+                _logger.error(f"{id_str}: cannot convert [{date_val.input}] to date - ANOMALY")
+                self._info["invalid_date"].append((uuid, date_val.input))
+                self._info["anomalies"].append((uuid, date_val.input, "Cannot parse date format"))
+                return updater.ret_invalid_meta
+
+        return self._perform_update(item, date_val, uuid, id_str)
+
     def add_new_metadata(self, item) -> int:
         uuid = item['uuid']
 
@@ -250,7 +312,7 @@ def update(self, item: dict, force: bool = False) -> int:
                     for i in range(len(date_meta)):
                         if len(val) == 0:
                             date_val = date(date_meta[i]["value"])
-                            if date_val.is_valid() or date_val.parse():
+                            if date_val.is_valid_hybrid() or date_val.parse_hybrid():
                                 val = date_val.value
                                 continue
                         if val == '' and i == len(date_meta) - 1:
@@ -275,6 +337,7 @@ def update(self, item: dict, force: bool = False) -> int:
                         f"Forced metadata change but no value found for [{uuid}]")
                     return updater.ret_empty_meta
 
+            # Always use standard validation/parsing (no relaxed mode)
-            # Always use standard validation/parsing (no relaxed mode)
+            # Use strict validation/parsing for metadata values to ensure data integrity.
+            # Relaxed parsing is intentionally avoided to prevent accepting malformed or ambiguous data.
-            # Always use standard validation/parsing (no relaxed mode)
+            # Use strict validation/parsing for metadata values to ensure data integrity.
+            # Relaxed parsing is intentionally avoided to prevent accepting malformed or ambiguous data.
             return self.update_existing_metadata(item, val, force=force)
         else:
             return self.add_new_metadata(item)
@@ -312,17 +375,31 @@ def print_info(self, show_limit=100):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Add metadata for DSpace items")
     parser.add_argument("--to_mtd_field",
-                        type=str, required=True, help="Metadata field to be created.")
+                        type=str, required=False, help="Metadata field to be created or updated (required unless --fix-date-format is used).")
     parser.add_argument("--from_mtd_field",
-                        type=str, nargs='+', required=True,
+                        type=str, nargs='+', required=False,
                         help="Metadata field(s) from which value(s) can be used.")
+    parser.add_argument("--fix-date-format", action='store_true', default=False,
+                        help="Fix date format in dc.date.issued field (no other parameters needed)")
     parser.add_argument("--endpoint", type=str, default=env["backend"]["endpoint"])
     parser.add_argument("--user", type=str, default=env["backend"]["user"])
     parser.add_argument("--password", type=str, default=env["backend"]["password"])
     parser.add_argument("--dry-run", action='store_true', default=False)
     parser.add_argument("--result-every-N", type=int, default=10000)
     parser.add_argument("--only", type=str, default=None)
     args = parser.parse_args()
+
+    # Handle fix-date-format mode
+    if args.fix_date_format:
+        args.to_mtd_field = "dc.date.issued"
+        args.from_mtd_field = ["dc.date.issued"]
+        _logger.info("Fix date format mode enabled: correcting dc.date.issued")
+
+    # Validate required arguments for non-fix-date-format mode
+    if not args.fix_date_format:
+        if args.to_mtd_field is None or args.from_mtd_field is None:
+            parser.error("--to_mtd_field and --from_mtd_field are required unless --fix-date-format is used")
+
     # output args from parse_args but without passwords
     args_dict = vars(args).copy()
     args_dict.pop("password", None)
@@ -341,7 +418,8 @@ def print_info(self, show_limit=100):
     # Initialize DSpace backend
     dspace_be = dspace.rest(endpoint, user, password, True)
 
-    upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run)
+    upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, 
+                  dry_run=args.dry_run)
 
     stats = additional_stats()
 
@@ -436,6 +514,15 @@ def print_info(self, show_limit=100):
     for k, v in upd.info.items():
         _logger.info(f"{k:20s}:{len(v):6d}: first {limit} items .. {list(v)[:limit]}...")
 
+    _logger.info(40 * "=")
+    _logger.info("Anomalies found:")
+    if len(upd.info["anomalies"]) > 0:
+        _logger.warning(f"Total anomalies: {len(upd.info['anomalies'])}")
+        for uuid, value, reason in upd.info["anomalies"][:100]:  # Show first 100
+            _logger.warning(f"  Item [{uuid}]: value=[{value}] - {reason}")
+    else:
+        _logger.info("No anomalies found")
+
     _logger.info(40 * "=")
     _logger.info("Date info")
     msgs = "\n\t".join(upd.cannot_parse)