
Commit f5a284f

Merge pull request #13 from basnijholt/fix-datetime-schema-mismatch
Fix datetime schema mismatch
2 parents 60321f7 + d0c24e7 commit f5a284f


2 files changed: +125, -5 lines changed


slurm_usage.py

Lines changed: 52 additions & 5 deletions
@@ -24,6 +24,8 @@
 import os
 import re
 import subprocess
+import types
+import typing
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta, timezone
@@ -42,6 +44,8 @@
 from rich.table import Table

 UTC = timezone.utc
+# Preserve the original datetime class for type mapping even when patched in tests
+_DATETIME_TYPE = datetime

 app = typer.Typer(help="SLURM Job Monitor - Collect and analyze job efficiency metrics")
 console = Console()
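
The `_DATETIME_TYPE` alias is worth a brief illustration: the schema mapping introduced below keys on the class object itself, so it must hold the class the Pydantic fields were annotated with, not whatever `datetime` happens to name after a test patches it. A minimal sketch of the distinction (the `FakeDatetime` class is hypothetical, standing in for whatever a test fixture might install):

from datetime import datetime

_DATETIME_TYPE = datetime  # bound once at import time, as in slurm_usage


class FakeDatetime(datetime):
    """Stand-in for a class a test fixture might patch over `slurm_usage.datetime`."""


annotation = datetime  # the Pydantic field annotations still refer to the real class

# A mapping keyed on the patched name would miss the real annotation ...
patched_mapping = {FakeDatetime: "Datetime(us, UTC)"}
assert patched_mapping.get(annotation) is None  # would silently fall back to Utf8

# ... while keying on the preserved alias keeps the lookup stable.
stable_mapping = {_DATETIME_TYPE: "Datetime(us, UTC)"}
assert stable_mapping.get(annotation) == "Datetime(us, UTC)"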
@@ -672,6 +676,39 @@ def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for DataFrame creation."""
         return self.model_dump()

+    @classmethod
+    def get_polars_schema(cls) -> dict[str, pl.DataType]:
+        """Get Polars schema derived from Pydantic model fields."""
+        mapping: dict[type[Any], pl.DataType] = {
+            str: pl.Utf8,
+            int: pl.Int64,
+            float: pl.Float64,
+            bool: pl.Boolean,
+            # All datetime fields should be UTC
+            _DATETIME_TYPE: pl.Datetime("us", "UTC"),
+        }
+
+        schema: dict[str, pl.DataType] = {}
+        for field_name, field_info in cls.model_fields.items():
+            annotation = field_info.annotation
+
+            # Handle Optional types (Union[T, None] or T | None)
+            origin = typing.get_origin(annotation)
+            if origin in (typing.Union, types.UnionType):
+                args = typing.get_args(annotation)
+                non_none_args = [arg for arg in args if arg is not type(None)]
+                if non_none_args:
+                    annotation = non_none_args[0]
+
+            mapped_type: pl.DataType | None = None
+            if isinstance(annotation, type):
+                mapped_type = mapping.get(annotation)
+
+            # Map Python types to Polars types (default to Utf8 for unknown types)
+            schema[field_name] = mapped_type or pl.Utf8
+
+        return schema
+

 class DateCompletionTracker(BaseModel):
     """Tracks which dates have been fully processed and don't need re-collection."""
@@ -780,10 +817,16 @@ def _parse_datetime(date_str: str | None) -> datetime | None:
         return None
     try:
         # SLURM uses ISO format: 2025-08-19T10:30:00
-        return datetime.fromisoformat(date_str)
+        dt = datetime.fromisoformat(date_str)
     except (ValueError, AttributeError):
         return None

+    # Ensure timezone-aware (assume UTC if naive)
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=UTC)
+
+    return dt
+

 def _parse_gpu_count(alloc_tres: str) -> int:
     """Parse GPU count from AllocTRES string.
@@ -1330,10 +1373,14 @@ def _processed_jobs_to_dataframe(
        DataFrame with job data

    """
-    return pl.DataFrame(
-        [j.to_dict() for j in processed_jobs],
-        infer_schema_length=None,
-    )
+    # Create DataFrame with explicit schema to prevent Null type inference
+    schema = ProcessedJob.get_polars_schema()
+
+    if not processed_jobs:
+        return pl.DataFrame(schema=schema)
+
+    data_dicts = [j.to_dict() for j in processed_jobs]
+    return pl.DataFrame(data_dicts, schema=schema)


def _save_processed_jobs_to_parquet(
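
To make the "Null type inference" comment concrete, here is a minimal sketch with a hypothetical two-column frame (not the real `ProcessedJob` schema): a column containing only `None`, or an empty job list, would otherwise be inferred as the Polars `Null` dtype, which no longer lines up with parquet files written with a `Datetime` column.

import polars as pl

rows = [{"job_id": "1", "end_time": None}]  # e.g. a job that is still running

inferred = pl.DataFrame(rows, infer_schema_length=None)
print(inferred.schema)  # end_time is inferred as Null

schema = {"job_id": pl.Utf8, "end_time": pl.Datetime("us", "UTC")}
explicit = pl.DataFrame(rows, schema=schema)
print(explicit.schema)  # end_time is Datetime(us, UTC); the value stays null

# An empty frame keeps the declared dtypes as well, matching what is on disk
empty = pl.DataFrame(schema=schema)
print(empty.schema)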

tests/test_data_processing.py

Lines changed: 73 additions & 0 deletions
@@ -5,6 +5,8 @@
 import os
 import re
 import sys
+import tempfile
+from datetime import datetime, timezone
 from pathlib import Path

 import pytest
@@ -402,3 +404,74 @@ def test_parse_gres_multiple_sockets(self) -> None:
         cleaned_gres = re.sub(r"\(S:[0-9-]+\)", "", gres)
         gpu_parts = cleaned_gres.split(":")
         assert int(gpu_parts[-1]) == expected_count
+
+
+class TestDatetimeSchemaConsistency:
+    """Test datetime schema consistency when saving and loading data."""
+
+    def test_parse_datetime_returns_utc(self) -> None:
+        """Test that _parse_datetime returns UTC timezone-aware datetimes."""
+        # Test with ISO format string
+        dt = slurm_usage._parse_datetime("2025-09-20T10:30:00")
+        assert dt is not None
+        assert dt.tzinfo is not None
+        assert dt.tzinfo == timezone.utc
+
+        # Test with None
+        assert slurm_usage._parse_datetime(None) is None
+        assert slurm_usage._parse_datetime("Unknown") is None
+
+    def test_processed_jobs_to_dataframe(self) -> None:
+        """Test that processed jobs are correctly converted to DataFrame."""
+        # Create test ProcessedJob with datetime fields
+        from slurm_usage import ProcessedJob
+
+        job = ProcessedJob(
+            job_id="test123",
+            user="alice",
+            job_name="test_job",
+            partition="gpus",
+            state="COMPLETED",
+            submit_time=datetime(2025, 9, 20, 9, 0, 0, tzinfo=timezone.utc),
+            start_time=datetime(2025, 9, 20, 10, 0, 0, tzinfo=timezone.utc),
+            end_time=datetime(2025, 9, 20, 11, 0, 0, tzinfo=timezone.utc),
+            node_list="node-001",
+            elapsed_seconds=3600,
+            alloc_cpus=4,
+            req_mem_mb=4096,
+            max_rss_mb=2048,
+            total_cpu_seconds=7200,
+            alloc_gpus=1,
+            cpu_efficiency=50.0,
+            memory_efficiency=50.0,
+            cpu_hours_wasted=1.0,
+            memory_gb_hours_wasted=2.0,
+            cpu_hours_reserved=2.0,
+            memory_gb_hours_reserved=4.0,
+            gpu_hours_reserved=1.0,
+            is_complete=True,
+        )
+
+        # Convert to DataFrame
+        df = slurm_usage._processed_jobs_to_dataframe([job])
+
+        # Check DataFrame was created correctly
+        assert len(df) == 1
+        assert df["job_id"][0] == "test123"
+        assert df["user"][0] == "alice"
+
+    def test_load_recent_data_handles_empty_directory(self) -> None:
+        """Test that _load_recent_data handles empty directory gracefully."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config = slurm_usage.Config(
+                data_dir=Path(tmpdir),
+                groups={},
+                user_to_group={},
+            )
+
+            processed_dir = Path(tmpdir) / "processed"
+            processed_dir.mkdir(parents=True, exist_ok=True)
+
+            # Should return None for empty directory
+            result = slurm_usage._load_recent_data(config, days=1)
+            assert result is None
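
None of the new tests exercise an actual parquet round trip, so as a purely illustrative extension (hypothetical, not part of this commit), a check that the UTC datetime dtype survives saving and loading might look like:

import tempfile
from pathlib import Path

import polars as pl

schema = {"end_time": pl.Datetime("us", "UTC")}
df = pl.DataFrame(schema=schema)  # empty frame with the declared dtype

with tempfile.TemporaryDirectory() as tmpdir:
    path = Path(tmpdir) / "jobs.parquet"
    df.write_parquet(path)
    loaded = pl.read_parquet(path)

# The timezone-aware dtype should come back unchanged
assert loaded.schema["end_time"] == pl.Datetime("us", "UTC")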
