ray-project · amogkam · Apr 28, 2022 · Apr 21, 2022 · Apr 21, 2022 · Apr 21, 2022
diff --git a/python/ray/ml/train/data_parallel_trainer.py b/python/ray/ml/train/data_parallel_trainer.py
@@ -1,10 +1,11 @@
 import inspect
 import logging
 from pathlib import Path
-from typing import Dict, Callable, Optional, Union
+from typing import Dict, Callable, List, Optional, Union, TYPE_CHECKING
 
 import ray
 from ray import tune
+from ray.actor import ActorHandle
 from ray.ml.constants import TRAIN_DATASET_KEY, PREPROCESSOR_KEY
 from ray.ml.trainer import Trainer
 from ray.ml.config import ScalingConfig, RunConfig
@@ -14,9 +15,13 @@
 from ray.train import BackendConfig, TrainingIterator
 from ray.train.backend import BackendExecutor
 from ray.train.checkpoint import TuneCheckpointManager
+from ray.train.impl.dataset_spec import _RayDatasetSpec
 from ray.train.utils import construct_train_func
 from ray.util.annotations import DeveloperAPI
 
+if TYPE_CHECKING:
+    from ray.data import Dataset
+
 logger = logging.getLogger(__name__)
 
 
@@ -292,27 +297,17 @@ def training_loop(self) -> None:
         else:
             resume_checkpoint_dict = None
 
-        # Tell Ray Train to only shard the train dataset and not the other datasets.
-        # This is purely an implementation detail and users do not need to know about
-        # this.
-        # TODO(amog): Refactor this to remove hack and make this more modular.
-        #  TrainingIterator should accept a generic custom_ingest_func that contains
-        #  the logic for how to split the Datasets.
-        updated_dataset_dict = {}
-        for key, value in self.datasets.items():
-            if key == TRAIN_DATASET_KEY:
-                updated_dataset_dict[key] = value
-            else:
-                # Ray Train will strip out the added string before exposing to users.
-                updated_dataset_dict[key + "_NO-SHARD"] = value
+        dataset_spec = _RayDatasetSpec(
+            dataset_or_dict=self.datasets, dataset_split_fn=_default_dataset_split_fn
+        )
 
         # TODO(amog): Have TrainingIterator also accept a checkpoint ObjectRef instead
         #  of just a Dict.
         training_iterator = TrainingIterator(
             backend_executor=backend_executor,
             backend_config=self.backend_config,
             train_func=train_loop_per_worker,
-            dataset=updated_dataset_dict if len(updated_dataset_dict) > 0 else None,
+            dataset_spec=dataset_spec,
             checkpoint_manager=checkpoint_manager,
             checkpoint=resume_checkpoint_dict,
             checkpoint_strategy=None,
@@ -348,3 +343,39 @@ def write_checkpoint(self, checkpoint: Dict):
     @property
     def latest_checkpoint_dir(self) -> Optional[Path]:
         raise NotImplementedError
+
+
+def _default_dataset_split_fn(
+    dataset_dict: Dict[str, "Dataset"], training_worker_handles: List[ActorHandle]
+) -> List[Dict[str, "Dataset"]]:
+    """Defines splitting logic of Datasets passed into ``DataParallelTrainer``.
+
+    By default, only the training dataset will be split. All other datasets will not
+    be split and will be passed through directly to the training workers. This is
+    because validation is often implemented on just the rank 0 worker.
+
+    Args:
+        dataset_dict: A dictionary of Datasets.
+        training_worker_handles: The actor handles of the training workers to use for
+            locality hints.
+
+    Returns:
+        A list of dataset dictionaries for each training worker.
+    """
+    dataset_dict_splits = [{} for _ in range(len(training_worker_handles))]
+
+    for key, dataset in dataset_dict.items():
+        if key == TRAIN_DATASET_KEY:
+            dataset_splits = dataset.split(
+                len(training_worker_handles),
+                equal=True,
+                locality_hints=training_worker_handles,
+            )
+        else:
+            # Only shard the training dataset.
+            dataset_splits = [dataset] * len(training_worker_handles)
+
+        for i in range(len(dataset_splits)):
+            dataset_dict_splits[i][key] = dataset_splits[i]
+
+    return dataset_dict_splits
diff --git a/python/ray/ml/trainer.py b/python/ray/ml/trainer.py
@@ -257,9 +257,9 @@ def preprocess_datasets(self) -> None:
         If the ``Trainer`` has both a datasets dict and
         a preprocessor, the datasets dict contains a training dataset (denoted by
         the "train" key), and the preprocessor has not yet
-        been fit, then it will be fit on the train.
+        been fit, then it will be fit on the train dataset.
 
-        Then, the Trainer's datasets will be transformed by the preprocessor.
+        Then, all of the Trainer's datasets will be transformed by the preprocessor.
 
         The transformed datasets will be set back in the ``self.datasets`` attribute
         of the Trainer to be used when overriding ``training_loop``.

@@ -1,7 +1,7 @@
 import logging
 import os
 from collections import defaultdict
-from typing import Callable, TypeVar, List, Optional, Dict, Union, Type, Tuple
+from typing import Callable, TypeVar, List, Optional, Dict, Type, Tuple
 
 import ray
 from ray.exceptions import RayActorError
@@ -12,9 +12,10 @@
     TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV,
     TRAIN_ENABLE_WORKER_SPREAD_ENV,
 )
+from ray.train.impl.dataset_spec import _RayDatasetSpec
 from ray.train.session import TrainingResult
 from ray.train.session import init_session, get_session, shutdown_session
-from ray.train.utils import RayDataset, check_for_failure, Singleton
+from ray.train.utils import check_for_failure, Singleton
 from ray.train.worker_group import WorkerGroup
 from ray.util.annotations import DeveloperAPI
 from ray.util.placement_group import get_current_placement_group, remove_placement_group
@@ -314,60 +315,22 @@ def _create_local_rank_map(self) -> Dict:
             ip_dict[node_ip] += 1
         return rank_mapping
 
-    def _get_dataset_shards(self, dataset_or_dict):
-
-        if dataset_or_dict is None:
-            # Return None for each shard.
-            return [None] * len(self.worker_group)
-
-        def split_dataset(dataset_or_pipeline):
-            actors = [worker.actor for worker in self.worker_group.workers]
-            return dataset_or_pipeline.split(
-                len(self.worker_group), equal=True, locality_hints=actors
-            )
-
-        if isinstance(dataset_or_dict, dict):
-            # Return a smaller dict for each shard.
-            dataset_shards = [{} for _ in range(len(self.worker_group))]
-            # TODO(amog): Update Backend to accept a generic function with logic on
-            #  how to split dataset, instead of having to support _NO-SHARD in key.
-            for key, dataset in dataset_or_dict.items():
-                if "_NO-SHARD" in key:
-                    # Do not shard this dataset.
-                    split_datasets = [dataset] * len(self.worker_group)
-                    key = key.replace("_NO-SHARD", "")
-                else:
-                    split_datasets = split_dataset(dataset)
-                assert len(split_datasets) == len(self.worker_group)
-                for i in range(len(split_datasets)):
-                    dataset_shards[i][key] = split_datasets[i]
-            return dataset_shards
-        else:
-            # return a smaller RayDataset for each shard.
-            return split_dataset(dataset_or_dict)
-
     def start_training(
         self,
         train_func: Callable[[], T],
-        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
+        dataset_spec: _RayDatasetSpec,
         checkpoint: Optional[Dict] = None,
     ) -> None:
         """Executes a training function on all workers in a separate thread.
 
         ``finish_training`` should be called after this.
 
         Args:
-            train_func (Callable): The training function to run on each worker.
-            dataset (Optional[Union[Dataset, DatasetPipeline]])
-                Distributed Ray Dataset or DatasetPipeline to pass into
-                worker, which can be accessed from the training function via
-                ``train.get_dataset_shard()``. Sharding will automatically be
-                handled by the Trainer. Multiple Datasets can be passed in as
-                a ``Dict`` that maps each name key to a Dataset value,
-                and each Dataset can be accessed from the training function
-                by passing in a `dataset_name` argument to
-                ``train.get_dataset_shard()``.
-            checkpoint (Optional[Dict]): The checkpoint data that
+            train_func: The training function to run on each worker.
+            dataset_spec: A specification for the Ray Dataset to be
+                passed to the training workers, and the logic on how to shard the Ray
+                Dataset.
+            checkpoint: The checkpoint data that
                 should be loaded onto each worker and accessed by the
                 training function via ``train.load_checkpoint()``. If this
                 is ``None`` then no checkpoint will be loaded.
@@ -406,7 +369,8 @@ def initialize_session(
                 )
 
         if self.dataset_shards is None:
-            self.dataset_shards = self._get_dataset_shards(dataset)
+            actors = [worker.actor for worker in self.worker_group.workers]
+            self.dataset_shards = dataset_spec.get_dataset_shards(actors)
 
         local_rank_map = self._create_local_rank_map()
 

@@ -0,0 +1,93 @@
+from dataclasses import dataclass
+from typing import Optional, Union, Dict, Callable, List, TYPE_CHECKING
+
+from ray.actor import ActorHandle
+
+if TYPE_CHECKING:
+    from ray.data import Dataset, DatasetPipeline
+
+RayDataset = Union["Dataset", "DatasetPipeline"]
+
+
+@dataclass
+class _RayDatasetSpec:
+    """Configuration for Ray Datasets to pass to the training workers.
+
+    dataset_or_dict: An optional Ray Dataset (or DatasetPipeline) or a dictionary of
+        datasets to be sharded across all the training workers, which can be accessed
+        from the training function via ``train.get_dataset_shard()``. Multiple Datasets
+        can be passed in as a dictionary that maps each name key to a Dataset value,
+        and each Dataset can be accessed from the training function by passing in a
+        `dataset_name` argument to ``train.get_dataset_shard()``.
+    dataset_split_fn: An optional callable to specify how the provided
+        ``dataset_or_dict`` should be split across the training workers. It is
+        expected to take in two arguments. The first one is ``dataset_or_dict``,
+        just as is passed in to the ``_RayDatasetSpec``. The second argument is a
+        list of the ActorHandles of the training workers (to use as locality hints).
+        The Callable is expected to return a list of RayDatasets or a list of
+        dictionaries of RayDatasets, with the length of the list equal to the length
+        of the list of actor handles. If None is provided, the provided Ray
+        Dataset(s) will simply be split using the actor handles as locality hints.
+
+    """
+
+    dataset_or_dict: Optional[Union[RayDataset, Dict[str, RayDataset]]]
+    dataset_split_fn: Optional[
+        Callable[
+            [Union[RayDataset, Dict[str, RayDataset]], List[ActorHandle]],
+            List[Union[RayDataset, Dict[str, RayDataset]]],
+        ]
+    ] = None
+
+    def _default_split_fn(
+        self, training_worker_handles: List[ActorHandle]
+    ) -> List[Optional[Union[RayDataset, Dict[str, RayDataset]]]]:
+        def split_dataset(dataset_or_pipeline):
+            return dataset_or_pipeline.split(
+                len(training_worker_handles),
+                equal=True,
+                locality_hints=training_worker_handles,
+            )
+
+        if isinstance(self.dataset_or_dict, dict):
+            # Return a smaller dict for each shard.
+            dataset_shards = [{} for _ in range(len(training_worker_handles))]
+            for key, dataset in self.dataset_or_dict.items():
+                split_datasets = split_dataset(dataset)
+                assert len(split_datasets) == len(training_worker_handles)
+                for i in range(len(split_datasets)):
+                    dataset_shards[i][key] = split_datasets[i]
+            return dataset_shards
+        else:
+            # return a smaller RayDataset for each shard.
+            return split_dataset(self.dataset_or_dict)
+
+    def get_dataset_shards(
+        self, training_worker_handles: List[ActorHandle]
+    ) -> List[Optional[Union[RayDataset, Dict[str, RayDataset]]]]:
+        """Returns Dataset splits based on the spec and the given training workers.
+
+        Args:
+            training_worker_handles: A list of the training worker actor handles.
+
+        Returns:
+            A list of RayDataset shards or list of dictionaries of RayDataset shards,
+                one for each training worker.
+
+        """
+        if not self.dataset_or_dict:
+            return [None] * len(training_worker_handles)
+
+        if self.dataset_split_fn is None:
+            return self._default_split_fn(training_worker_handles)
+        else:
+            splits = self.dataset_split_fn(
+                self.dataset_or_dict, training_worker_handles
+            )
+            if not len(splits) == len(training_worker_handles):
+                raise RuntimeError(
+                    "The list of Datasets returned by the "
+                    f"`dataset_split_fn`: {len(splits)} does not match "
+                    f"the number of training workers: {len(training_worker_handles)}"
+                )
+            return splits
@@ -25,7 +25,8 @@
     RESULT_FETCH_TIMEOUT,
     SESSION_MISUSE_LOG_ONCE_KEY,
 )
-from ray.train.utils import PropagatingThread, RayDataset
+from ray.train.utils import PropagatingThread
+from ray.train.impl.dataset_spec import RayDataset
 from ray.util import PublicAPI, log_once