[feature] Add experimental PyTorch support #4335

Merged
132 commits merged on Aug 20, 2020
Commits (132)
017c3cb
Begin porting work
awjuliani Apr 14, 2020
d99fc74
Add ResNet and distributions
awjuliani Apr 16, 2020
c981a81
Merge remote-tracking branch 'origin/master' into develop-add-fire
awjuliani Apr 17, 2020
6dfb8fa
Merge remote-tracking branch 'origin/master' into develop-add-fire
awjuliani Apr 20, 2020
a492a9f
Dynamically construct actor and critic
awjuliani Apr 20, 2020
5e6f4ae
Initial optimizer port
awjuliani Apr 21, 2020
7e46bc5
Refactoring policy and optimizer
awjuliani Apr 22, 2020
a3a1c0f
Resolving a few bugs
awjuliani Apr 22, 2020
652b399
Share more code between tf and torch policies
awjuliani Apr 22, 2020
da49aaa
Slightly closer to running model
awjuliani Apr 22, 2020
1ae28be
Training runs, but doesn’t actually work
awjuliani Apr 23, 2020
b68eb20
Fix a couple additional bugs
awjuliani Apr 23, 2020
5e39d84
Add conditional sigma for distribution
awjuliani Apr 23, 2020
a0d6823
Fix normalization
Apr 24, 2020
c807190
Merge remote-tracking branch 'origin/develop-add-fire-debug' into dev…
awjuliani Apr 27, 2020
e2d7fee
Support discrete actions as well
awjuliani Apr 27, 2020
f5b28d3
Continuous and discrete now train
awjuliani Apr 28, 2020
50f5cc1
Multi-discrete now working
awjuliani Apr 28, 2020
8c10cd3
Visual observations now train as well
awjuliani Apr 28, 2020
8445661
Merge remote-tracking branch 'origin/master' into develop-add-fire
awjuliani Apr 29, 2020
deb6e92
GRU in-progress and dynamic cnns
awjuliani Apr 30, 2020
57486ab
Fix for memories
awjuliani Apr 30, 2020
f6d5df5
Remove unused arg
awjuliani Apr 30, 2020
5521670
Combine actor and critic classes. Initial export.
awjuliani May 4, 2020
9b9e783
Support tf and pytorch alongside one another
awjuliani May 4, 2020
e98def6
Prepare model for onnx export
awjuliani May 5, 2020
d6d69ad
Merge remote-tracking branch 'origin/master' into develop-add-fire
awjuliani May 5, 2020
2c6daac
Use LSTM and fix a few merge errors
awjuliani May 6, 2020
411d0c4
Merge remote-tracking branch 'origin/master' into develop-add-fire
awjuliani May 11, 2020
8b36db0
Fix bug in probs calculation
awjuliani May 11, 2020
ff72b3e
Optimize np -> tensor operations
awjuliani May 11, 2020
ee9fbd1
Time action sample function
awjuliani May 11, 2020
8f92145
Small performance improvement during inference
awjuliani May 13, 2020
4eead36
Merge remote-tracking branch 'origin/master' into develop-add-fire
awjuliani May 19, 2020
b3d1201
Merge master
awjuliani Jun 15, 2020
892f385
ONNX exporting
awjuliani Jun 18, 2020
d12c053
Fix some issues with pdf
awjuliani Jun 26, 2020
742d322
Fix bug in pdf function
awjuliani Jun 29, 2020
509a858
Fix ResNet
awjuliani Jun 29, 2020
2a22e17
Remove double setting
awjuliani Jul 1, 2020
3442de5
Fix for discrete actions (#4181)
Jul 1, 2020
aadaca9
Fix discrete actions and GridWorld
Jul 2, 2020
a303586
Remove print statement
Jul 2, 2020
b3ca0c9
Convert List[np.ndarray] to np.ndarray before using torch.as_tensor (…
Jul 2, 2020
088cbe9
Develop add fire exp framework (#4213)
vincentpierre Jul 10, 2020
da3a7f8
reformatting experiment_torch.py
vincentpierre Jul 10, 2020
5d5c4ea
Pytorch port of SAC (#4219)
Jul 22, 2020
4214ec8
Update add-fire to latest master, including Policy refactor (#4263)
Jul 24, 2020
38c3dd1
[refactor] Refactor normalizers and encoders (#4275)
Jul 29, 2020
13b78e7
fix onnx save path and output_name
Jul 31, 2020
254f83b
add Saver class (only TF working)
Aug 3, 2020
43e32f6
Merge branch 'develop-add-fire-checkpoint' of https://github.com/Unit…
Aug 3, 2020
7756a87
fix pytorch checkpointing. add tensors in Normalizer as parameter
Aug 3, 2020
dbf2daf
remove print
Aug 3, 2020
02f1916
move tf and add torch model serialization
Aug 4, 2020
d57b830
remove
Aug 4, 2020
b62a1cd
remove unused
Aug 4, 2020
8bb30b1
add sac checkpoint
Aug 4, 2020
cce8227
small improvements
Aug 4, 2020
2da4d88
small improvements
Aug 4, 2020
6e8ed26
remove print
Aug 4, 2020
76ef088
move checkpoint_path logic to saver
Aug 4, 2020
17bacbb
[refactor] Refactor Actor and Critic classes (#4287)
Aug 4, 2020
ea93224
fix onnx input
Aug 5, 2020
1ff782a
fix formatting and test
Aug 5, 2020
949aa1f
[bug-fix] Fix non-LSTM SeparateActorCritic (#4306)
Aug 5, 2020
08b810a
small improvements
Aug 6, 2020
560f937
small improvement
Aug 6, 2020
02e35fd
[bug-fix] Fix error with discrete probs (#4309)
Aug 6, 2020
9d0fad2
[tests] Add tests for core PyTorch files (#4292)
Aug 6, 2020
6f9bd88
Merge branch 'develop-add-fire' into develop-add-fire-checkpoint
Aug 6, 2020
19c9ff0
[feature] Fix TF tests, add --torch CLI option, allow run TF without …
Aug 6, 2020
749acff
Test fixes on add-fire (#4317)
vincentpierre Aug 6, 2020
d33ad07
fix tests
Aug 7, 2020
ace4394
Add components directory and init (#4320)
andrewcoh Aug 7, 2020
4759d1f
[add-fire] Halve Gaussian entropy (#4319)
Aug 7, 2020
c2b0074
[add-fire] Add learning rate and beta/epsilon decay to PyTorch (#4318)
Aug 7, 2020
143876b
Added Reward Providers for Torch (#4280)
vincentpierre Aug 7, 2020
7b2c2f9
Fix discrete export (#4322)
dongruoping Aug 8, 2020
9430fb3
[add-fire] Fix CategoricalDistInstance test and replace `range` with …
Aug 10, 2020
7c3ff1d
Develop add fire layers (#4321)
vincentpierre Aug 10, 2020
f54bf42
Merge branch 'master' into develop-add-fire-mm
Aug 10, 2020
e1dce72
fixing typo
vincentpierre Aug 10, 2020
9913e71
[add-fire] Merge post-0.19.0 master into add-fire (#4328)
Aug 11, 2020
d9e6198
Revert "[add-fire] Merge post-0.19.0 master into add-fire (#4328)" (#…
Aug 11, 2020
1bae38e
More comments and Made ResNetBlock (#4329)
vincentpierre Aug 11, 2020
ff667e7
Merge pull request #4331 from Unity-Technologies/develop-add-fire-mm2
Aug 11, 2020
680c823
Merge branch 'develop-add-fire' into develop-add-fire-checkpoint
Aug 11, 2020
b6bc80d
update saver interface and add tests
Aug 11, 2020
42f24b3
update
Aug 13, 2020
9874a35
Fixed the reporting of the discriminator loss (#4348)
vincentpierre Aug 13, 2020
a23669d
Fix ONNX import for continuous
Aug 13, 2020
e51db51
fix export input names
Aug 13, 2020
83e17bb
Behavioral Cloning Pytorch (#4293)
andrewcoh Aug 13, 2020
b706bfe
Merge branch 'develop-add-fire-checkpoint' of https://github.com/Unit…
Aug 13, 2020
6d19f58
fix export input name
Aug 13, 2020
5ce6272
[add-fire] Add LSTM to SAC, LSTM fixes and initializations (#4324)
Aug 13, 2020
9d95298
add comments
Aug 14, 2020
003f4a6
Merge branch 'develop-add-fire' into develop-add-fire-checkpoint
Aug 14, 2020
cb87d78
Merge branch 'master' into develop-add-fire-mm3
Aug 14, 2020
06b2106
fix bc tests
Aug 14, 2020
61f3aca
Merge branch 'develop-add-fire-mm3' into develop-add-fire-checkpoint
Aug 14, 2020
4d7d118
change brain_name to behavior_name
Aug 14, 2020
de0265e
Merge master and add Saver class for save/load checkpoints
dongruoping Aug 14, 2020
291091a
reverting Project settings
vincentpierre Aug 14, 2020
d37960c
[add-fire] Fix masked mean for 2d tensors (#4364)
Aug 14, 2020
c3fae3a
Removing the experiment script from add fire (#4373)
vincentpierre Aug 18, 2020
71e7b17
[add-fire] Add tests and fix issues with Policy (#4372)
Aug 18, 2020
f9273bb
Pytorch ghost trainer (#4370)
andrewcoh Aug 18, 2020
23e8d72
add test_simple_rl tests to torch
andrewcoh Aug 18, 2020
6635413
revert tests
andrewcoh Aug 18, 2020
1d89489
Fix of the test for multi visual input
vincentpierre Aug 18, 2020
48e77c6
Make reset block submodule
Aug 18, 2020
6e75dd1
fix export input_name
Aug 19, 2020
7660a90
[add-fire] Memory class abstraction (#4375)
Aug 19, 2020
4db512b
make visual input channel first for export
Aug 19, 2020
bd41761
Merge branch 'develop-add-fire' into develop-add-fire-export
Aug 19, 2020
47212e5
Don't use torch.split in LSTM
Aug 19, 2020
09c2dc3
Add fire to test_simple_rl.py (#4378)
andrewcoh Aug 19, 2020
b22f412
Merge branch 'develop-add-fire' of github.com:Unity-Technologies/ml-a…
Aug 19, 2020
269a4c8
reverting unity_to_external_pb2_grpc.py
vincentpierre Aug 19, 2020
3d7b809
remove duplicate of curr documentation
andrewcoh Aug 19, 2020
1940d96
Revert "remove duplicate of curr documentation"
andrewcoh Aug 19, 2020
9406624
remove duplicated curriculum doc (#4386)
andrewcoh Aug 19, 2020
0a8b5e0
Fixed discrete models
Aug 19, 2020
e6eb502
Always export one Action tensor (#4388)
Aug 19, 2020
6f46b30
[add-fire] Revert unneeded changes back to master (#4389)
Aug 20, 2020
435d226
add comment
Aug 20, 2020
1a15577
fix test
Aug 20, 2020
38c1007
Fix export
dongruoping Aug 20, 2020
ddcf078
add fire clean up docstrings in create policies (#4391)
andrewcoh Aug 20, 2020
e93c746
[add-fire] Update changelog (#4397)
Aug 20, 2020
4 changes: 4 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -29,6 +29,10 @@ and this project adheres to
- The interaction between EnvManager and TrainerController was changed; EnvManager.advance() was split into two stages,
and TrainerController now uses the results from the first stage to handle new behavior names. This change speeds up
Python training by approximately 5-10%. (#4259)
- Experimental PyTorch support has been added. Use `--torch` when running `mlagents-learn`, or add
`framework: pytorch` to your trainer configuration (under the behavior name) to enable it.
Note that PyTorch 1.6.0 or greater should be installed to use this feature; see
[the PyTorch website](https://pytorch.org/) for installation instructions. (#4335)

### Minor Changes
#### com.unity.ml-agents (C#)
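As a usage illustration of the changelog entry above, a trainer configuration enabling the PyTorch backend for a single behavior might look like the sketch below; the behavior name `3DBall` and the `trainer_type` key are placeholders drawn from the example environments, not part of this diff:

    behaviors:
      3DBall:
        trainer_type: ppo
        framework: pytorch

Equivalently, pass the new CLI flag described above: `mlagents-learn config.yaml --torch`.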
2 changes: 1 addition & 1 deletion docs/Learning-Environment-Examples.md
@@ -460,7 +460,7 @@ you would like to contribute environments, please see our
head, thighs, shins, feet, arms, forearms and hands.
- Goal: The agent must move its body toward the goal direction without falling.
- `WalkerDynamic`- Goal direction is randomized.
- `WalkerDynamicVariableSpeed`- Goal direction and walking speed are randomized.
- `WalkerDynamicVariableSpeed`- Goal direction and walking speed are randomized.
Comment from @ervteng (Contributor, Author), Aug 20, 2020:

I've tried to remove this; this delta isn't picked up by git 👿

- `WalkerStatic` - Goal direction is always forward.
- `WalkerStaticVariableSpeed` - Goal direction is always forward. Walking
speed is randomized.
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/buffer.py
@@ -48,7 +48,7 @@ def extend(self, data: np.ndarray) -> None:
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data))
self += list(np.array(data, dtype=np.float32))

def set(self, data):
"""
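The explicit `dtype=np.float32` matters for the PyTorch path (see commit b3ca0c9 above): converting a Python list of arrays to a tensor element by element is slow, whereas a single contiguous float32 array converts in one copy and matches the models' float32 weights. A rough sketch of the difference, with made-up names:

    import numpy as np
    import torch

    steps = [np.random.rand(84) for _ in range(1000)]  # e.g. per-step observations

    # Slow path: PyTorch walks the Python list element by element, and the
    # result inherits float64 from np.random.rand.
    slow = torch.as_tensor(steps)

    # Fast path: one contiguous float32 array, converted in a single copy.
    fast = torch.as_tensor(np.array(steps, dtype=np.float32))
    assert fast.dtype == torch.float32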
7 changes: 7 additions & 0 deletions ml-agents/mlagents/trainers/cli_utils.py
@@ -168,6 +168,13 @@ def _create_parser() -> argparse.ArgumentParser:
action=DetectDefaultStoreTrue,
help="Forces training using CPU only",
)
argparser.add_argument(
"--torch",
default=False,
action=DetectDefaultStoreTrue,
help="(Experimental) Use the PyTorch framework instead of TensorFlow. Install PyTorch "
"before using this option",
)

eng_conf = argparser.add_argument_group(title="Engine Configuration")
eng_conf.add_argument(
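`DetectDefaultStoreTrue` is ml-agents' own argparse action. A minimal sketch of the idea behind it, using a hypothetical stand-in rather than the real class: a `store_true`-style action that also records which options were set explicitly, so later configuration code can tell a default apart from a deliberate user choice.

    import argparse

    class StoreTrueAndRecord(argparse.Action):
        """Hypothetical stand-in: store_true that remembers explicit flags."""

        explicitly_set: set = set()

        def __init__(self, option_strings, dest, **kwargs):
            super().__init__(option_strings, dest, nargs=0, **kwargs)

        def __call__(self, parser, namespace, values, option_string=None):
            setattr(namespace, self.dest, True)
            StoreTrueAndRecord.explicitly_set.add(self.dest)

    parser = argparse.ArgumentParser()
    parser.add_argument("--torch", default=False, action=StoreTrueAndRecord)
    args = parser.parse_args(["--torch"])
    assert args.torch and "torch" in StoreTrueAndRecord.explicitly_set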
12 changes: 7 additions & 5 deletions ml-agents/mlagents/trainers/ghost/trainer.py
@@ -304,7 +304,10 @@ def save_model(self) -> None:
self.trainer.save_model()

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
self,
parsed_behavior_id: BehaviorIdentifiers,
behavior_spec: BehaviorSpec,
create_graph: bool = False,
) -> Policy:
"""
Creates policy with the wrapped trainer's create_policy function
@@ -313,10 +316,10 @@ def create_policy(
team are grouped. All policies associated with this team are added to the
wrapped trainer to be trained.
"""
policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec)
policy.create_tf_graph()
policy = self.trainer.create_policy(
parsed_behavior_id, behavior_spec, create_graph=True
)
self.trainer.saver.initialize_or_load(policy)
policy.init_load_weights()
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)

@@ -326,7 +329,6 @@ def create_policy(
parsed_behavior_id, behavior_spec
)
self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy)
internal_trainer_policy.init_load_weights()
self.current_policy_snapshot[
parsed_behavior_id.brain_name
] = internal_trainer_policy.get_weights()
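The new `create_graph` argument moves graph construction into the wrapped trainer's `create_policy`, so the ghost trainer no longer calls `create_tf_graph()` and `init_load_weights()` itself. A sketch of the resulting contract, with a simplified class and a hypothetical `_construct_policy` helper, not the real trainer code:

    class WrappedTrainerSketch:
        def create_policy(self, parsed_behavior_id, behavior_spec, create_graph=False):
            policy = self._construct_policy(parsed_behavior_id, behavior_spec)
            if create_graph:
                # TF policies build their graph here (and, per the tf_policy.py
                # change below, their weight-assignment ops); Torch policies
                # have no graph to build.
                policy.create_tf_graph()
            return policy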
94 changes: 94 additions & 0 deletions ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
@@ -0,0 +1,94 @@
from typing import Dict, Optional, Tuple, List
import torch
import numpy as np

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.torch.utils import ModelUtils


class TorchOptimizer(Optimizer): # pylint: disable=W0223
def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
super().__init__()
self.policy = policy
self.trainer_settings = trainer_settings
self.update_dict: Dict[str, torch.Tensor] = {}
self.value_heads: Dict[str, torch.Tensor] = {}
self.memory_in: torch.Tensor = None
self.memory_out: torch.Tensor = None
self.m_size: int = 0
self.global_step = torch.tensor(0)
self.bc_module: Optional[BCModule] = None
self.create_reward_signals(trainer_settings.reward_signals)
if trainer_settings.behavioral_cloning is not None:
self.bc_module = BCModule(
self.policy,
trainer_settings.behavioral_cloning,
policy_learning_rate=trainer_settings.hyperparameters.learning_rate,
default_batch_size=trainer_settings.hyperparameters.batch_size,
default_num_epoch=3,
)

def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
pass

def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
for reward_signal, settings in reward_signal_configs.items():
# Name reward signals by string in case we have duplicates later
self.reward_signals[reward_signal.value] = create_reward_provider(
reward_signal, self.policy.behavior_spec, settings
)

def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_encoders
):
visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
visual_obs.append(visual_ob)
else:
visual_obs = []

memory = torch.zeros([1, 1, self.policy.m_size])

vec_vis_obs = SplitObservations.from_observations(next_obs)
next_vec_obs = [
ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
]
next_vis_obs = [
ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
for _vis_ob in vec_vis_obs.visual_observations
]

value_estimates, next_memory = self.policy.actor_critic.critic_pass(
vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
)

next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_vec_obs, next_vis_obs, next_memory, sequence_length=1
)

for name, estimate in value_estimates.items():
value_estimates[name] = estimate.detach().cpu().numpy()
next_value_estimate[name] = next_value_estimate[name].detach().cpu().numpy()

if done:
for k in next_value_estimate:
if not self.reward_signals[k].ignore_done:
next_value_estimate[k] = 0.0

return value_estimates, next_value_estimate
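`get_trajectory_value_estimates` returns per-step value estimates plus a bootstrap value for the state after the last step, zeroed on terminal steps unless the reward provider's `ignore_done` is set. A hedged sketch of how such outputs typically feed a bootstrapped discounted return, not the trainer's actual advantage code:

    import numpy as np

    def discounted_returns(rewards: np.ndarray, bootstrap: float, gamma: float = 0.99) -> np.ndarray:
        # Work backwards from the value estimate of the post-trajectory state.
        returns = np.zeros_like(rewards, dtype=np.float32)
        running = bootstrap
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    # e.g., with hypothetical trainer plumbing:
    # values, next_values = optimizer.get_trajectory_value_estimates(batch, next_obs, done)
    # returns = discounted_returns(extrinsic_rewards, next_values["extrinsic"])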
2 changes: 2 additions & 0 deletions ml-agents/mlagents/trainers/policy/tf_policy.py
@@ -152,6 +152,8 @@ def create_tf_graph(self) -> None:
# We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
# it will re-load the full graph
self.initialize()
# Create assignment ops for Ghost Trainer
self.init_load_weights()

def _create_encoder(
self,
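For context on `init_load_weights`: in TF1-style code, swapping weights without rebuilding the graph is usually done by pre-creating one placeholder and one assign op per variable, which is what the ghost trainer's snapshot swaps rely on. A minimal sketch of that pattern, assuming TF 1.x and illustrative names rather than the actual ml-agents implementation:

    import tensorflow as tf

    def make_assign_ops(variables):
        # Created once; reused for every snapshot swap.
        placeholders, ops = [], []
        for var in variables:
            ph = tf.placeholder(var.dtype.base_dtype, shape=var.shape)
            placeholders.append(ph)
            ops.append(tf.assign(var, ph))
        return placeholders, ops

    def set_weights(sess, placeholders, ops, weights):
        sess.run(ops, feed_dict=dict(zip(placeholders, weights)))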