[bug-fix] Fix crash when demo size is smaller than batch size #3591

Merged
merged 6 commits on Mar 9, 2020

1 change: 1 addition & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -43,6 +43,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity. (#3555)
- The `IFloatProperties` interface has been removed.
- Fix #3579.
- Fixed a crash when using GAIL with fewer demonstrations than `batch_size`. (#3591)

## [0.14.1-preview] - 2020-02-25

@@ -8,6 +8,7 @@

from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer

logger = logging.getLogger("mlagents.trainers")

@@ -39,7 +40,7 @@ def __init__(self, policy: TFPolicy, strength: float, gamma: float):
self.strength = strength
self.stats_name_to_update_name: Dict[str, str] = {}

def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
"""
Evaluates the reward for the data present in the AgentBuffer mini_batch. Use this when evaluating a reward
function drawn straight from a Buffer.
@@ -54,7 +55,7 @@ def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
)

def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict
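As background for the `Dict[str, np.ndarray]` to `AgentBuffer` signature change above, here is a minimal sketch (not part of this PR's diff) of the two `AgentBuffer` behaviours the reward-signal code relies on, assuming the dict-style field access and `num_experiences` property used elsewhere in this changeset:

```python
import numpy as np

from mlagents.trainers.buffer import AgentBuffer

# Fields are created on first access and behave like lists of per-step values.
mini_batch = AgentBuffer()
for _ in range(4):
    mini_batch["actions"].append(np.zeros(2, dtype=np.float32))
    mini_batch["environment_rewards"].append(1.0)

# The reward signals in this PR only need these two behaviours:
assert len(mini_batch["actions"]) == 4   # dict-style field access, as in evaluate_batch
assert mini_batch.num_experiences == 4   # total experience count, as in prepare_update
```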
@@ -5,6 +5,7 @@
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer


class CuriosityRewardSignal(RewardSignal):
@@ -41,7 +42,7 @@ def __init__(
}
self.has_updated = False

def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
@@ -80,7 +81,7 @@ def check_config(
super().check_config(config_dict, param_keys)

def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
Prepare for update and get feed_dict.
@@ -2,6 +2,7 @@
import numpy as np

from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.buffer import AgentBuffer


class ExtrinsicRewardSignal(RewardSignal):
@@ -16,6 +17,6 @@ def check_config(
param_keys = ["strength", "gamma"]
super().check_config(config_dict, param_keys)

def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
return RewardSignalResult(self.strength * env_rews, env_rews)
@@ -6,6 +6,7 @@
from mlagents.trainers.policy.tf_policy import TFPolicy
from .model import GAILModel
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer


class GAILRewardSignal(RewardSignal):
@@ -61,7 +62,7 @@ def __init__(
"Policy/GAIL Expert Estimate": "gail_expert_estimate",
}

def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
@@ -101,24 +102,17 @@ def check_config(
super().check_config(config_dict, param_keys)

def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
Prepare inputs for update.
:param policy: The policy being updated.
:param mini_batch: A mini batch of trajectories sampled from the current policy.
:param num_sequences: Number of sequences in the mini batch.
:return: Feed_dict for update process.
"""
max_num_experiences = min(
len(mini_batch["actions"]), self.demonstration_buffer.num_experiences
)
# If num_sequences is less, we need to shorten the input batch.
for key, element in mini_batch.items():
mini_batch[key] = element[:max_num_experiences]

# Get batch from demo buffer
# Get batch from demo buffer. Even if demo buffer is smaller, we sample with replacement
mini_batch_demo = self.demonstration_buffer.sample_mini_batch(
len(mini_batch["actions"]), 1
mini_batch.num_experiences, 1
)

feed_dict: Dict[tf.Tensor, Any] = {
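To make the intent of the `prepare_update` change clearer, below is a short sketch (illustrative only, not part of the diff) of the behaviour the fix relies on: `sample_mini_batch` draws with replacement, so the demonstration mini-batch can always match the policy mini-batch size even when the demo buffer holds fewer experiences than `batch_size`. The sizes below are hypothetical.

```python
from mlagents.trainers.buffer import AgentBuffer

DEMO_SIZE = 32       # hypothetical: fewer demonstrations than the training batch
POLICY_BATCH = 256   # hypothetical: size of the sampled policy mini-batch

demo_buffer = AgentBuffer()
for i in range(DEMO_SIZE):
    demo_buffer["actions"].append([float(i)])

policy_batch = AgentBuffer()
for i in range(POLICY_BATCH):
    policy_batch["actions"].append([float(i)])

# Mirrors the new call above: ask for as many demo experiences as there are
# policy experiences. With sampling with replacement, the smaller demo buffer
# still yields a full-sized batch instead of forcing the policy batch to shrink.
mini_batch_demo = demo_buffer.sample_mini_batch(policy_batch.num_experiences, 1)
assert len(mini_batch_demo["actions"]) == POLICY_BATCH
```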
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/sac/optimizer.py
@@ -537,7 +537,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
return update_stats

def update_reward_signals(
self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int
) -> Dict[str, float]:
"""
Only update the reward signals.
@@ -567,7 +567,7 @@ def add_reward_signal_dicts(
feed_dict: Dict[tf.Tensor, Any],
update_dict: Dict[str, tf.Tensor],
stats_needed: Dict[str, str],
reward_signal_minibatches: Mapping[str, Dict],
reward_signal_minibatches: Mapping[str, AgentBuffer],
num_sequences: int,
) -> None:
"""
66 changes: 66 additions & 0 deletions ml-agents/mlagents/trainers/tests/test_ppo.py
@@ -14,6 +14,10 @@
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
gail_dummy_config,
)


@pytest.fixture
@@ -122,6 +126,68 @@ def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
)


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_curiosity(
curiosity_dummy_config, dummy_config, rnn, visual, discrete # noqa: F811
):
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(curiosity_dummy_config)
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)


# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(gail_dummy_config)
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)

# Check if buffer size is too big
update_buffer = mb.simulate_rollout(3000, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)

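The second half of the GAIL test above exercises the case where the update buffer is much larger than the demonstration buffer. A companion check that targets the small-demo path even more directly could look like the sketch below; it is illustrative only (not part of this PR) and assumes the `reward_signals` mapping on the optimizer and the `demonstration_buffer` attribute shown in the GAIL diff above, plus the helpers already imported in this test module.

```python
def test_ppo_optimizer_update_gail_small_demo(gail_dummy_config, dummy_config):  # noqa: F811
    # Sketch only: shrink the demo buffer below the update batch size and make
    # sure the GAIL update still runs without truncating the policy batch.
    tf.reset_default_graph()
    dummy_config["reward_signals"].update(gail_dummy_config)
    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=False, use_discrete=False, use_visual=False
    )
    gail_signal = optimizer.reward_signals["gail"]
    # Keep only a handful of demonstration experiences (fewer than the batch).
    gail_signal.demonstration_buffer = gail_signal.demonstration_buffer.sample_mini_batch(8, 1)

    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    # Mock out reward signal eval, mirroring the tests above
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
```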

@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])