petab1->2: create experiment df

dweindl · dweindl · commit d2715e96fb23 · 2024-12-16T22:32:56.000+01:00
diff --git a/petab/v2/C.py b/petab/v2/C.py
@@ -13,14 +13,6 @@
 #: Experiment ID column in the measurement table
 EXPERIMENT_ID = "experimentId"
 
-# TODO: remove
-#: Preequilibration condition ID column in the measurement table
-PREEQUILIBRATION_CONDITION_ID = "preequilibrationConditionId"
-
-# TODO: remove
-#: Simulation condition ID column in the measurement table
-SIMULATION_CONDITION_ID = "simulationConditionId"
-
 #: Measurement value column in the measurement table
 MEASUREMENT = "measurement"
 
@@ -45,17 +37,13 @@
 #: Mandatory columns of measurement table
 MEASUREMENT_DF_REQUIRED_COLS = [
     OBSERVABLE_ID,
-    # TODO: add
-    # EXPERIMENT_ID,
-    SIMULATION_CONDITION_ID,
+    EXPERIMENT_ID,
     MEASUREMENT,
     TIME,
 ]
 
 #: Optional columns of measurement table
 MEASUREMENT_DF_OPTIONAL_COLS = [
-    # TODO: remove
-    PREEQUILIBRATION_CONDITION_ID,
     OBSERVABLE_PARAMETERS,
     NOISE_PARAMETERS,
     DATASET_ID,
diff --git a/petab/v2/__init__.py b/petab/v2/__init__.py
@@ -27,7 +27,10 @@
 
 # import after v1
 from ..version import __version__  # noqa: F401, E402
-from . import models  # noqa: F401, E402
+from . import (  # noqa: F401, E402
+    C,  # noqa: F401, E402
+    models,  # noqa: F401, E402
+)
 from .conditions import *  # noqa: F403, F401, E402
 from .experiments import (  # noqa: F401, E402
     get_experiment_df,
diff --git a/petab/v2/lint.py b/petab/v2/lint.py
@@ -15,6 +15,9 @@
 from .. import v2
 from ..v1.lint import (
     _check_df,
+    assert_measured_observables_defined,
+    assert_measurements_not_null,
+    assert_measurements_numeric,
     assert_model_parameters_in_condition_or_parameter_table,
     assert_no_leading_trailing_whitespace,
     assert_parameter_bounds_are_numeric,
@@ -23,13 +26,16 @@
     assert_parameter_prior_parameters_are_valid,
     assert_parameter_prior_type_is_valid,
     assert_parameter_scale_is_valid,
+    assert_unique_observable_ids,
     assert_unique_parameter_ids,
     check_ids,
-    check_measurement_df,
     check_observable_df,
     check_parameter_bounds,
 )
-from ..v1.measurements import split_parameter_replacement_list
+from ..v1.measurements import (
+    assert_overrides_match_parameter_count,
+    split_parameter_replacement_list,
+)
 from ..v1.observables import get_output_parameters, get_placeholders
 from ..v1.visualize.lint import validate_visualization_df
 from ..v2.C import *
@@ -237,8 +243,51 @@ def run(self, problem: Problem) -> ValidationIssue | None:
         if problem.measurement_df is None:
             return
 
+        df = problem.measurement_df
         try:
-            check_measurement_df(problem.measurement_df, problem.observable_df)
+            _check_df(df, MEASUREMENT_DF_REQUIRED_COLS, "measurement")
+
+            for column_name in MEASUREMENT_DF_REQUIRED_COLS:
+                if not np.issubdtype(df[column_name].dtype, np.number):
+                    assert_no_leading_trailing_whitespace(
+                        df[column_name].values, column_name
+                    )
+
+            for column_name in MEASUREMENT_DF_OPTIONAL_COLS:
+                if column_name in df and not np.issubdtype(
+                    df[column_name].dtype, np.number
+                ):
+                    assert_no_leading_trailing_whitespace(
+                        df[column_name].values, column_name
+                    )
+
+            if problem.observable_df is not None:
+                assert_measured_observables_defined(df, problem.observable_df)
+                assert_overrides_match_parameter_count(
+                    df, problem.observable_df
+                )
+
+                if OBSERVABLE_TRANSFORMATION in problem.observable_df:
+                    # Check for positivity of measurements in case of
+                    #  log-transformation
+                    assert_unique_observable_ids(problem.observable_df)
+                    # If the above is not checked, in the following loop
+                    # trafo may become a pandas Series
+                    for measurement, obs_id in zip(
+                        df[MEASUREMENT], df[OBSERVABLE_ID], strict=True
+                    ):
+                        trafo = problem.observable_df.loc[
+                            obs_id, OBSERVABLE_TRANSFORMATION
+                        ]
+                        if measurement <= 0.0 and trafo in [LOG, LOG10]:
+                            raise ValueError(
+                                "Measurements with observable "
+                                f"transformation {trafo} must be "
+                                f"positive, but {measurement} <= 0."
+                            )
+
+            assert_measurements_not_null(df)
+            assert_measurements_numeric(df)
         except AssertionError as e:
             return ValidationError(str(e))
 
@@ -247,46 +296,21 @@ def run(self, problem: Problem) -> ValidationIssue | None:
         #  condition table should be an error if the measurement table refers
         #  to conditions
 
-        # check that measured experiments/conditions exist
+        # check that measured experiments exist
         # TODO: fully switch to experiment table and remove this:
-        if SIMULATION_CONDITION_ID in problem.measurement_df:
-            if problem.condition_df is None:
-                return
-            used_conditions = set(
-                problem.measurement_df[SIMULATION_CONDITION_ID].dropna().values
-            )
-            if PREEQUILIBRATION_CONDITION_ID in problem.measurement_df:
-                used_conditions |= set(
-                    problem.measurement_df[PREEQUILIBRATION_CONDITION_ID]
-                    .dropna()
-                    .values
-                )
-            available_conditions = set(
-                problem.condition_df[CONDITION_ID].unique()
-            )
-            if missing_conditions := (used_conditions - available_conditions):
-                return ValidationError(
-                    "Measurement table references conditions that "
-                    "are not specified in the condition table: "
-                    + str(missing_conditions)
-                )
-        elif EXPERIMENT_ID in problem.measurement_df:
-            if problem.experiment_df is None:
-                return
-            used_experiments = set(
-                problem.measurement_df[EXPERIMENT_ID].values
-            )
-            available_experiments = set(
-                problem.condition_df[CONDITION_ID].unique()
+
+        if problem.experiment_df is None:
+            return
+        # Compare against the experiment table (not the condition table):
+        #  the measurement table references experiment IDs.
+        used_experiments = set(problem.measurement_df[EXPERIMENT_ID].values)
+        available_experiments = set(problem.experiment_df[EXPERIMENT_ID])
+        if missing_experiments := (used_experiments - available_experiments):
+            raise AssertionError(
+                "Measurement table references experiments that "
+                "are not specified in the experiments table: "
+                + str(missing_experiments)
             )
-            if missing_experiments := (
-                used_experiments - available_experiments
-            ):
-                raise AssertionError(
-                    "Measurement table references experiments that "
-                    "are not specified in the experiments table: "
-                    + str(missing_experiments)
-                )
 
 
 class CheckConditionTable(ValidationTask):
diff --git a/petab/v2/petab1to2.py b/petab/v2/petab1to2.py
@@ -4,8 +4,8 @@
 from itertools import chain
 from pathlib import Path
 from urllib.parse import urlparse
+from uuid import uuid4
 
-import numpy as np
 import pandas as pd
 from pandas.io.common import get_handle, is_url
 
@@ -98,10 +98,81 @@ def petab1to2(yaml_config: Path | str, output_dir: Path | str = None):
             condition_df = v1v2_condition_df(condition_df, petab_problem.model)
             v2.write_condition_df(condition_df, get_dest_path(condition_file))
 
+        # records for the experiment table to be created
+        experiments = []
+
+        def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
+            """Map a v1 condition pair to its v2 experiment ID.
+
+            Returns "" if neither ID is set. The result depends only on the
+            arguments, as this is called for both the experiment table and
+            the measurement rows, which must agree on the ID.
+            """
+            # TODO: can a condition ID be used as an experiment ID if there
+            #  would be only a single condition in the experiment?
+            if not sim_cond_id and not preeq_cond_id:
+                return ""
+            if preeq_cond_id:
+                preeq_cond_id = f"{preeq_cond_id}_"
+            return f"experiment_{preeq_cond_id}{sim_cond_id}"
+
+        measured_experiments = (
+            petab_problem.get_simulation_conditions_from_measurement_df()
+        )
+        for (
+            _,
+            row,
+        ) in measured_experiments.iterrows():
+            sim_cond_id = row[v1.C.SIMULATION_CONDITION_ID]
+            preeq_cond_id = row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, "")
+            exp_id = create_experiment_id(sim_cond_id, preeq_cond_id)
+            if preeq_cond_id:
+                experiments.append(
+                    {
+                        v2.C.EXPERIMENT_ID: exp_id,
+                        v2.C.CONDITION_ID: preeq_cond_id,
+                        v2.C.TIME: float("-inf"),
+                    }
+                )
+            experiments.append(
+                {
+                    v2.C.EXPERIMENT_ID: exp_id,
+                    v2.C.CONDITION_ID: sim_cond_id,
+                    v2.C.TIME: 0,
+                }
+            )
+        if experiments:
+            exp_table_path = output_dir / "experiments.tsv"
+            if exp_table_path.exists():
+                raise ValueError(
+                    f"Experiment table file {exp_table_path} already exists."
+                )
+            problem_config[v2.C.EXPERIMENT_FILES] = [exp_table_path.name]
+            v2.write_experiment_df(
+                v2.get_experiment_df(pd.DataFrame(experiments)), exp_table_path
+            )
+
         for measurement_file in problem_config.get(v2.C.MEASUREMENT_FILES, []):
             measurement_df = v1.get_measurement_df(
                 get_src_path(measurement_file)
             )
+            # if there is already an experiment ID column, we rename it
+            if v2.C.EXPERIMENT_ID in measurement_df.columns:
+                measurement_df.rename(
+                    columns={v2.C.EXPERIMENT_ID: f"experiment_id_{uuid4()}"},
+                    inplace=True,
+                )
+            # add pre-eq condition id if not present or convert to string
+            #  for simplicity
+            if v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns:
+                measurement_df[
+                    v1.C.PREEQUILIBRATION_CONDITION_ID
+                ] = measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID].astype(
+                    str
+                )
+            else:
+                measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""
+
             if (
                 petab_problem.condition_df is not None
                 and len(
@@ -112,18 +183,29 @@ def petab1to2(yaml_config: Path | str, output_dir: Path | str = None):
             ):
                 # can't have "empty" conditions with no overrides in v2
                 # TODO: this needs to be done condition wise
-                measurement_df[v2.C.SIMULATION_CONDITION_ID] = np.nan
+                measurement_df[v1.C.SIMULATION_CONDITION_ID] = ""
                 if (
                     v1.C.PREEQUILIBRATION_CONDITION_ID
                     in measurement_df.columns
                 ):
-                    measurement_df[v2.C.PREEQUILIBRATION_CONDITION_ID] = np.nan
+                    measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""
+            # condition IDs to experiment IDs
+            measurement_df.insert(
+                0,
+                v2.C.EXPERIMENT_ID,
+                measurement_df.apply(
+                    lambda row: create_experiment_id(
+                        row[v1.C.SIMULATION_CONDITION_ID],
+                        row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, ""),
+                    ),
+                    axis=1,
+                ),
+            )
+            del measurement_df[v1.C.SIMULATION_CONDITION_ID]
+            del measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID]
             v2.write_measurement_df(
                 measurement_df, get_dest_path(measurement_file)
             )
-    # TODO: Measurements: preequilibration to experiments/timecourses once
-    #  finalized
-    ...
 
     # validate updated Problem
     validation_issues = v2.lint_problem(new_yaml_file)
diff --git a/petab/v2/problem.py b/petab/v2/problem.py
@@ -908,27 +908,25 @@ def add_parameter(
     def add_measurement(
         self,
         obs_id: str,
-        sim_cond_id: str,
+        experiment_id: str,
         time: float,
         measurement: float,
         observable_parameters: Sequence[str] = None,
         noise_parameters: Sequence[str] = None,
-        preeq_cond_id: str = None,
     ):
         """Add a measurement to the problem.
 
         Arguments:
             obs_id: The observable ID
-            sim_cond_id: The simulation condition ID
+            experiment_id: The experiment ID
             time: The measurement time
             measurement: The measurement value
             observable_parameters: The observable parameters
             noise_parameters: The noise parameters
-            preeq_cond_id: The pre-equilibration condition ID
         """
         record = {
             OBSERVABLE_ID: [obs_id],
-            SIMULATION_CONDITION_ID: [sim_cond_id],
+            EXPERIMENT_ID: [experiment_id],
             TIME: [time],
             MEASUREMENT: [measurement],
         }
@@ -940,8 +938,6 @@ def add_measurement(
             record[NOISE_PARAMETERS] = [
                 PARAMETER_SEPARATOR.join(noise_parameters)
             ]
-        if preeq_cond_id is not None:
-            record[PREEQUILIBRATION_CONDITION_ID] = [preeq_cond_id]
 
         tmp_df = pd.DataFrame(record)
         self.measurement_df = (
diff --git a/tests/v1/test_petab.py b/tests/v1/test_petab.py
@@ -44,14 +44,14 @@ def petab_problem():
     petab_problem = petab.Problem()
     petab_problem.add_measurement(
         obs_id="obs1",
-        sim_cond_id="condition1",
+        experiment_id="experiment1",
         time=1.0,
         measurement=0.1,
         noise_parameters=["p3", "p4"],
     )
     petab_problem.add_measurement(
         obs_id="obs2",
-        sim_cond_id="condition2",
+        experiment_id="experiment2",
         time=1.0,
         measurement=0.2,
         observable_parameters=["p1", "p2"],
@@ -63,6 +63,9 @@ def petab_problem():
         "condition2", fixedParameter1=2.0, name="Condition 2"
     )
 
+    petab_problem.add_experiment("experiment1", 0, "condition1")
+    petab_problem.add_experiment("experiment2", 0, "condition2")
+
     petab_problem.add_parameter("dynamicParameter1", estimate=1)
     petab_problem.add_parameter("dynamicParameter2", estimate=0, name="...")
 
@@ -87,6 +90,11 @@ def petab_problem():
             petab_problem.condition_df, condition_file_name
         )
 
+        experiment_file_name = Path(temp_dir, "experiments.tsv")
+        petab.write_experiment_df(
+            petab_problem.experiment_df, experiment_file_name
+        )
+
         parameter_file_name = Path(temp_dir, "parameters.tsv")
         petab.write_parameter_df(
             petab_problem.parameter_df, parameter_file_name
diff --git a/tests/v2/test_problem.py b/tests/v2/test_problem.py