+import pytest
+
+import numpy as np
+
+from mlagents.trainers.ghost.trainer import GhostTrainer
+from mlagents.trainers.ghost.controller import GhostController
+from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
+from mlagents.trainers.ppo.trainer import PPOTrainer
+from mlagents.trainers.agent_processor import AgentManagerQueue
+from mlagents.trainers.tests import mock_brain as mb
+from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
+from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings, FrameworkType
+
+
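+# Shared fixture: default trainer settings with self-play enabled, using the PyTorch framework.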
+@pytest.fixture
+def dummy_config():
+    return TrainerSettings(
+        self_play=SelfPlaySettings(), framework=FrameworkType.PYTORCH
+    )
+
+
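+# Test constants for the mocked behavior specs and rollouts.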
+VECTOR_ACTION_SPACE = 1
+VECTOR_OBS_SPACE = 8
+DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
+BUFFER_INIT_SAMPLES = 513
+NUM_AGENTS = 12
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_load_and_set(dummy_config, use_discrete):
+    mock_specs = mb.setup_test_behavior_specs(
+        use_discrete,
+        False,
+        vector_action_space=DISCRETE_ACTION_SPACE
+        if use_discrete
+        else VECTOR_ACTION_SPACE,
+        vector_obs_space=VECTOR_OBS_SPACE,
+    )
+
+    trainer_params = dummy_config
+    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
+    trainer.seed = 1
+    policy = trainer.create_policy("test", mock_specs)
+    trainer.seed = 20  # otherwise both networks initialize identically
+    to_load_policy = trainer.create_policy("test", mock_specs)
+
+    weights = policy.get_weights()
+    load_weights = to_load_policy.get_weights()
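+    # The two policies were seeded differently, so their weights are not expected
+    # to match yet; a mismatch in this pre-load check is deliberately ignored.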
+    try:
+        for w, lw in zip(weights, load_weights):
+            np.testing.assert_array_equal(w, lw)
+    except AssertionError:
+        pass
+
+    to_load_policy.load_weights(weights)
+    load_weights = to_load_policy.get_weights()
+
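+    # After loading, every weight tensor should match the source policy exactly.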
+    for w, lw in zip(weights, load_weights):
+        np.testing.assert_array_equal(w, lw)
+
+
+def test_process_trajectory(dummy_config):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[2], vector_obs_space=1
+    )
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
+
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
+    )
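+    # The GhostTrainer wraps the PPO trainer; the GhostController tracks which
+    # team is currently learning.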
+
+    # The first policy encountered becomes the one trained by the wrapped PPO trainer
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
+    trainer.subscribe_trajectory_queue(trajectory_queue0)
+
+    # The ghost trainer should ignore this queue because it is off-policy (non-learning team)
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
+    trainer.subscribe_trajectory_queue(trajectory_queue1)
+
+    time_horizon = 15
+    trajectory = make_fake_trajectory(
+        length=time_horizon,
+        max_step_complete=True,
+        observation_shapes=[(1,)],
+        action_space=[2],
+    )
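+    # The fake trajectory matches the spec above: a single (1,) observation
+    # and one two-action discrete branch.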
+    trajectory_queue0.put(trajectory)
+    trainer.advance()
+
+    # Check that the trainer put the trajectory in the update buffer
+    assert trainer.trainer.update_buffer.num_experiences == 15
+
+    trajectory_queue1.put(trajectory)
+    trainer.advance()
+
+    # Check that the ghost trainer ignored the off-policy queue
+    assert trainer.trainer.update_buffer.num_experiences == 15
+    # Check that it emptied the queue
+    assert trajectory_queue1.empty()
+
+
+def test_publish_queue(dummy_config):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[1], vector_obs_space=8
+    )
+
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+
+    brain_name = parsed_behavior_id0.brain_name
+
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
+    )
+
+    # The first policy encountered becomes the one trained by the wrapped PPO trainer.
+    # This queue should remain empty after a snapshot swap
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+    policy_queue0 = AgentManagerQueue(behavior_id_team0)
+    trainer.publish_policy_queue(policy_queue0)
+
+    # The ghost trainer should use this queue for the ghost policy swap
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+    policy_queue1 = AgentManagerQueue(behavior_id_team1)
+    trainer.publish_policy_queue(policy_queue1)
+
+    # Check that a snapshot swap pushes to the ghost queue and not the trainer queue
+    assert policy_queue0.empty() and policy_queue1.empty()
+    trainer._swap_snapshots()
+    assert policy_queue0.empty() and not policy_queue1.empty()
+    # Clear the swapped-in snapshot from the queue
+    policy_queue1.get_nowait()
+
+    mock_specs = mb.setup_test_behavior_specs(
+        False,
+        False,
+        vector_action_space=VECTOR_ACTION_SPACE,
+        vector_obs_space=VECTOR_OBS_SPACE,
+    )
+
+    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
+    # Mock out reward signal eval
+    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
+    buffer["extrinsic_returns"] = buffer["environment_rewards"]
+    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
+    buffer["curiosity_rewards"] = buffer["environment_rewards"]
+    buffer["curiosity_returns"] = buffer["environment_rewards"]
+    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
+    buffer["advantages"] = buffer["environment_rewards"]
+    trainer.trainer.update_buffer = buffer
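+    # Assumes BUFFER_INIT_SAMPLES (513) exceeds the configured buffer_size, so the
+    # wrapped trainer reports it is ready to update on the next advance().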
+
+    # When the ghost trainer advances and the wrapped trainer's buffer is full,
+    # the wrapped trainer pushes the updated policy to the correct queue
+    assert policy_queue0.empty() and policy_queue1.empty()
+    trainer.advance()
+    assert not policy_queue0.empty() and policy_queue1.empty()
+
+
+if __name__ == "__main__":
+    pytest.main()