Changes from all commits
38 commits
b392472
Add validation_data to Emulator API
sgreenbury Nov 4, 2025
c630317
Add output_to_tensor conversion, update usage
sgreenbury Nov 4, 2025
87a2a0b
Add conformal module
sgreenbury Nov 4, 2025
88153bf
Add initial conformal impl and test
sgreenbury Nov 4, 2025
ef684d3
Add ConformalMLP to registry and re-export
sgreenbury Nov 4, 2025
e76ed72
Add support for case when no cal data provided
sgreenbury Nov 4, 2025
6e99ffb
Update test_grads for Conformal emulators
sgreenbury Nov 4, 2025
31cbdd3
Update LightGBM with validation data
sgreenbury Nov 4, 2025
340d1e7
Update conformal MLP with kwargs
sgreenbury Nov 4, 2025
905a6d6
Update docstring
sgreenbury Nov 4, 2025
051ed55
Add conformal quantile regression
sgreenbury Nov 5, 2025
3c609fd
Add n_samples to APIs
sgreenbury Nov 5, 2025
e1db3c7
Add test passing validation data
sgreenbury Nov 5, 2025
c7146d9
Add comment
sgreenbury Nov 7, 2025
76ec652
Remove obsolete return type
sgreenbury Nov 7, 2025
0be09d5
Remove separate args
sgreenbury Nov 7, 2025
afefe7e
Merge remote-tracking branch 'origin/main' into 848-conformal-prediction
sgreenbury Nov 7, 2025
cbc9821
Update docstring and kwarg order
sgreenbury Nov 7, 2025
483309f
Fix indentation of docstring
sgreenbury Nov 7, 2025
edbf1d2
Fix docstrings
sgreenbury Nov 7, 2025
d1bc291
Add comment
sgreenbury Nov 7, 2025
52fa84c
Fix docstring
sgreenbury Nov 7, 2025
dbd9f2d
Revise type hints
sgreenbury Nov 7, 2025
ac897c8
Fix assertion in test
sgreenbury Nov 7, 2025
0728857
Add with_grad to docstring
sgreenbury Nov 13, 2025
bd9b57c
Update
sgreenbury Nov 13, 2025
ba2ce49
Revise return docstring
sgreenbury Nov 13, 2025
183dc76
Update method str to "constant"
sgreenbury Nov 13, 2025
5cf4f6e
Add option to customise distribution
sgreenbury Nov 13, 2025
65c326a
Fix error message
sgreenbury Nov 13, 2025
e0cf73c
Extend docstring and add reference
sgreenbury Nov 13, 2025
dcbfe72
Update docstring to explain adding new methods
sgreenbury Nov 13, 2025
b87c96d
Add conformal MLP subclass factory and subclasses
sgreenbury Nov 13, 2025
0d6bd9f
Fix test
sgreenbury Nov 13, 2025
012776d
Fix subclassing and defaults
sgreenbury Nov 14, 2025
7e786b5
Ensure bounds ordering is valid
sgreenbury Nov 17, 2025
f5c5f2d
Merge remote-tracking branch 'origin/main' into 848-conformal-prediction
sgreenbury Nov 18, 2025
6489aaa
Update tuning to support MetricParams
sgreenbury Nov 18, 2025
16 changes: 14 additions & 2 deletions autoemulate/core/compare.py
@@ -71,6 +71,7 @@ def __init__(
log_level: str = "progress_bar",
tuning_metric: str | Metric = "r2",
evaluation_metrics: list[str | Metric] | None = None,
n_samples: int = 1000,
):
"""
Initialize the AutoEmulate class.
@@ -130,6 +131,9 @@ def __init__(
Each entry can be a string shortcut or a MetricConfig object.
IMPORTANT: The first metric in the list is used to
determine the best model.
n_samples: int
Number of samples to draw to estimate the predictive mean when the
emulator does not expose a mean directly. Defaults to 1000.
"""
Results.__init__(self)
self.random_seed = random_seed
@@ -187,6 +191,7 @@ def __init__(
# Set up logger and ModelSerialiser for saving models
self.logger, self.progress_bar = get_configured_logger(log_level)
self.model_serialiser = ModelSerialiser(self.logger)
self.n_samples = n_samples

# Run compare
self.compare()
@@ -417,6 +422,9 @@ def compare(self):
n_splits=self.n_splits,
shuffle=self.shuffle,
transformed_emulator_params=self.transformed_emulator_params,
metric_params=MetricParams(
n_samples=self.n_samples
),
)
mean_scores = [
np.mean(score).item() for score in scores
@@ -484,7 +492,9 @@ def compare(self):
n_bootstraps=self.n_bootstraps,
device=self.device,
metrics=self.evaluation_metrics,
metric_params=MetricParams(y_train=train_val_y),
metric_params=MetricParams(
n_samples=self.n_samples, y_train=train_val_y
),
)
test_metrics = bootstrap(
transformed_emulator,
@@ -493,7 +503,9 @@
n_bootstraps=self.n_bootstraps,
device=self.device,
metrics=self.evaluation_metrics,
metric_params=MetricParams(y_train=train_val_y),
metric_params=MetricParams(
n_samples=self.n_samples, y_train=train_val_y
),
)

# Log all test metrics from test_metrics dictionary
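Reviewer note: the pattern introduced above bundles `n_samples` (and, for the held-out evaluation, the training targets) into a single `MetricParams` object rather than passing them as separate arguments. A minimal sketch of constructing these objects, assuming only the import path and field names visible in this diff; the tensor is a placeholder:

```python
import torch

from autoemulate.core.metrics import MetricParams

# Placeholder standing in for train_val_y.
train_val_y = torch.randn(64, 2)

# Cross-validation scoring only needs n_samples...
cv_params = MetricParams(n_samples=1000)

# ...while bootstrap evaluation also attaches the training targets,
# for metrics that require them.
test_params = MetricParams(n_samples=1000, y_train=train_val_y)
```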
13 changes: 12 additions & 1 deletion autoemulate/core/model_selection.py
@@ -1,5 +1,6 @@
import inspect
import logging
from dataclasses import replace

import torch
from sklearn.model_selection import BaseCrossValidator
@@ -61,6 +62,7 @@ def cross_validate(
device: DeviceLike = "cpu",
random_seed: int | None = None,
metrics: list[Metric] | None = None,
metric_params: MetricParams | None = None,
):
"""
Cross validate model performance using the given `cv` strategy.
@@ -85,6 +87,8 @@
Optional random seed for reproducibility.
metrics: list[Metric] | None
List of metrics to compute. If None, uses r2 and rmse.
metric_params: MetricParams | None
Additional parameters to pass to the metrics. Defaults to None.

Returns
-------
@@ -94,6 +98,7 @@
transformed_emulator_params = transformed_emulator_params or {}
x_transforms = x_transforms or []
y_transforms = y_transforms or []
metric_params = metric_params or MetricParams()

# Setup metrics
if metrics is None:
@@ -143,7 +148,13 @@
# compute and save results
y_pred = transformed_emulator.predict(x_val)
for metric in metrics:
score = evaluate(y_pred, y_val, metric)
score = evaluate(
y_pred,
y_val,
metric,
# Update metric_params with y_train in case it is required by the metric
metric_params=replace(metric_params, y_train=y),
)
cv_results[metric.name].append(score)
return cv_results

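Reviewer note: `replace` here is `dataclasses.replace`, which returns a copy of a dataclass instance with the named fields overridden, so each fold injects its own `y_train` without mutating the caller's `MetricParams`. A small sketch of that behaviour, assuming `MetricParams` is a plain dataclass as used in this diff:

```python
from dataclasses import replace

import torch

from autoemulate.core.metrics import MetricParams

base = MetricParams(n_samples=500)

# Per-fold copy with the fold's training targets attached; base is untouched.
fold_params = replace(base, y_train=torch.randn(48, 2))

print(fold_params.n_samples)      # 500, carried over from base
print(fold_params.y_train.shape)  # torch.Size([48, 2])
```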
6 changes: 5 additions & 1 deletion autoemulate/core/tuner.py
@@ -6,7 +6,7 @@
from torch.distributions import Transform

from autoemulate.core.device import TorchDeviceMixin
from autoemulate.core.metrics import Metric, get_metric
from autoemulate.core.metrics import Metric, MetricParams, get_metric
from autoemulate.core.model_selection import cross_validate
from autoemulate.core.types import (
DeviceLike,
@@ -74,6 +74,7 @@ def run(
n_splits: int = 5,
seed: int | None = None,
shuffle: bool = True,
metric_params: MetricParams | None = None,
) -> tuple[list[list[float]], list[ModelParams]]:
"""
Run randomised hyperparameter search for a given model.
@@ -97,6 +98,8 @@
shuffle: bool
Whether to shuffle data before splitting into cross validation folds.
Defaults to True.
metric_params: MetricParams | None
Additional parameters to pass to the metrics. Defaults to None.

Returns
-------
@@ -130,6 +133,7 @@
device=self.device,
random_seed=None,
metrics=[self.tuning_metric],
metric_params=metric_params,
)

# Reset retries following a successful cross_validation call
36 changes: 36 additions & 0 deletions autoemulate/data/utils.py
@@ -192,6 +192,42 @@ def _denormalize(
) -> TensorLike:
return (x * x_std) + x_mean

def output_to_tensor(
self,
output: OutputLike,
n_samples: int = 1000,
with_grad: bool = False,
) -> torch.Tensor:
"""Convert an output to a tensor (returns the mean if output is a distribution).
Parameters
----------
output: OutputLike
The output to convert to a tensor.
n_samples: int
Number of samples to draw from the distribution. Defaults to 1000.
with_grad: bool
Whether to enable gradient calculation. Defaults to False.
Returns
-------
TensorLike
Tensor of shape `(n_batch, n_targets)` as input or the mean of the output if
output is a distribution.
"""
if isinstance(output, TensorLike):
return output
try:
return output.mean
except Exception:
# Use sampling to get a mean if mean property not available
samples = (
output.rsample(torch.Size([n_samples]))
if with_grad
else output.sample(torch.Size([n_samples]))
)
return samples.mean(dim=0)


def set_random_seed(seed: int = 42, deterministic: bool = True):
"""
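Reviewer note: a self-contained illustration of the fallback path in `output_to_tensor`, using only `torch`. A `TransformedDistribution` does not implement `.mean`, so accessing it raises `NotImplementedError` and the `except` branch estimates the mean from `n_samples` draws instead:

```python
import torch
from torch.distributions import ExpTransform, Normal, TransformedDistribution

# Closed-form mean available: returned directly by the try branch.
normal = Normal(torch.zeros(4, 2), torch.ones(4, 2))
assert normal.mean.shape == (4, 2)

# A log-normal built via TransformedDistribution has no .mean property,
# so the mean must be estimated by Monte Carlo.
log_normal = TransformedDistribution(normal, [ExpTransform()])
try:
    _ = log_normal.mean
except NotImplementedError:
    mc_mean = log_normal.sample(torch.Size([1000])).mean(dim=0)
    print(mc_mean.shape)  # torch.Size([4, 2])
```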
2 changes: 2 additions & 0 deletions autoemulate/emulators/__init__.py
@@ -1,4 +1,5 @@
from .base import Emulator
from .conformal import ConformalMLP
from .ensemble import EnsembleMLP, EnsembleMLPDropout
from .gaussian_process.exact import (
GaussianProcessCorrelatedMatern32,
@@ -26,6 +27,7 @@

__all__ = [
"MLP",
"ConformalMLP",
"Emulator",
"EnsembleMLP",
"EnsembleMLPDropout",
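Reviewer note: with the re-export above, the new emulator is importable from the package root alongside the existing ones:

```python
from autoemulate.emulators import MLP, ConformalMLP, EnsembleMLP
```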
43 changes: 26 additions & 17 deletions autoemulate/emulators/base.py
@@ -39,9 +39,19 @@ class Emulator(ABC, ValidationMixin, ConversionMixin, TorchDeviceMixin):
supports_uq: bool = False

@abstractmethod
def _fit(self, x: TensorLike, y: TensorLike): ...
def _fit(
self,
x: TensorLike,
y: TensorLike,
validation_data: tuple[TensorLike, TensorLike] | None = None,
): ...

def fit(self, x: TensorLike, y: TensorLike):
def fit(
self,
x: TensorLike,
y: TensorLike,
validation_data: tuple[TensorLike, TensorLike] | None = None,
):
"""Fit the emulator to the provided data."""
# Ensure x and y are tensors and 2D
x, y = self._convert_to_tensors(x, y)
@@ -58,7 +68,7 @@ def fit(self, x: TensorLike, y: TensorLike):
y = self.y_transform(y) if self.y_transform is not None else y

# Fit emulator
self._fit(x, y)
self._fit(x, y, validation_data)
self.is_fitted_ = True

@abstractmethod
@@ -152,18 +162,7 @@ def predict_mean(
"""
x = self._ensure_with_grad(x, with_grad)
y_pred = self._predict(x, with_grad)
if isinstance(y_pred, TensorLike):
return y_pred
try:
return y_pred.mean
except Exception:
# Use sampling to get a mean if mean property not available
samples = (
y_pred.rsample(torch.Size([n_samples]))
if with_grad
else y_pred.sample(torch.Size([n_samples]))
)
return samples.mean(dim=0)
return self.output_to_tensor(y_pred, n_samples)

def predict_mean_and_variance(
self, x: TensorLike, with_grad: bool = False, n_samples: int = 100
@@ -559,7 +558,12 @@ def loss_func(self, y_pred, y_true):
"""Loss function to be used for training the model."""
return nn.MSELoss()(y_pred, y_true)

def _fit(self, x: TensorLike, y: TensorLike):
def _fit(
self,
x: TensorLike,
y: TensorLike,
validation_data: tuple[TensorLike, TensorLike] | None = None, # noqa: ARG002
):
"""
Train a PyTorchBackend model.

@@ -683,7 +687,12 @@ class SklearnBackend(DeterministicEmulator):
def _model_specific_check(self, x: NumpyLike, y: NumpyLike):
_, _ = x, y

def _fit(self, x: TensorLike, y: TensorLike):
def _fit(
self,
x: TensorLike,
y: TensorLike,
validation_data: tuple[TensorLike, TensorLike] | None = None, # noqa: ARG002
):
if self.normalize_y:
y, y_mean, y_std = self._normalize(y)
self.y_mean = y_mean
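Reviewer note: to make the updated `_fit` contract concrete, here is a standalone toy sketch (not the library's class) of a backend that accepts the optional `(x_val, y_val)` tuple. Backends that don't need it ignore it, as `PyTorchBackend` and `SklearnBackend` do above, while a conformal emulator would consume it as calibration data:

```python
import torch

class ToyBackend:
    """Toy stand-in mirroring the updated _fit signature."""

    def _fit(
        self,
        x: torch.Tensor,
        y: torch.Tensor,
        validation_data: tuple[torch.Tensor, torch.Tensor] | None = None,
    ):
        # Closed-form linear fit standing in for real training.
        self.coef_ = torch.linalg.lstsq(x, y).solution
        if validation_data is not None:
            # E.g. a conformal emulator would calibrate on this split.
            x_val, y_val = validation_data
            self.val_rmse_ = ((x_val @ self.coef_ - y_val) ** 2).mean().sqrt()

backend = ToyBackend()
backend._fit(
    torch.randn(50, 3),
    torch.randn(50, 2),
    validation_data=(torch.randn(10, 3), torch.randn(10, 2)),
)
print(backend.val_rmse_)  # validation RMSE of the toy fit
```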