"""
Adapted (time-dependent) GAE for PPO algorithm can be activated by setting
use_adapted_gae=True in the policy config. Additionally, it is required that
"callbacks" include the custom callback class in the Trainer's config.
Furthermore, the env must return in its info dictionary a key-value pair of
the form "d_ts": ... where the value is the length (time) of recent agent step.
This adapted, time-dependent computation of advantages may be useful in cases
where agent's actions take various times and thus time steps are not
equidistant (https://docdro.id/400TvlR)
"""
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.utils.annotations import override
import numpy as np


class MyCallbacks(DefaultCallbacks):
@override(DefaultCallbacks)
def on_postprocess_trajectory(
self,
*,
worker,
episode,
agent_id,
policy_id,
policies,
postprocessed_batch,
original_batches,
**kwargs
):
super().on_postprocess_trajectory(
worker=worker,
episode=episode,
agent_id=agent_id,
policy_id=policy_id,
policies=policies,
postprocessed_batch=postprocessed_batch,
original_batches=original_batches,
**kwargs
)
if policies[policy_id].config.get("use_adapted_gae", False):
policy = policies[policy_id]
assert policy.config[
"use_gae"
], "Can't use adapted gae without use_gae=True!"
info_dicts = postprocessed_batch[SampleBatch.INFOS]
assert np.all(
["d_ts" in info_dict for info_dict in info_dicts]
), "Info dicts in sample batch must contain data 'd_ts' \
(=ts[i+1]-ts[i] length of time steps)!"
d_ts = np.array(
                [float(info_dict.get("d_ts")) for info_dict in info_dicts]
)
assert np.all(
[e.is_integer() for e in d_ts]
), "Elements of 'd_ts' (length of time steps) must be integer!"
# Trajectory is actually complete -> last r=0.0.
if postprocessed_batch[SampleBatch.DONES][-1]:
last_r = 0.0
# Trajectory has been truncated -> last r=VF estimate of last obs.
else:
# Input dict is provided to us automatically via the Model's
# requirements. It's a single-timestep (last one in trajectory)
# input_dict.
# Create an input dict according to the Model's requirements.
input_dict = postprocessed_batch.get_single_step_input_dict(
policy.model.view_requirements, index="last"
)
last_r = policy._value(**input_dict)
gamma = policy.config["gamma"]
lambda_ = policy.config["lambda"]
vpred_t = np.concatenate(
[postprocessed_batch[SampleBatch.VF_PREDS], np.array([last_r])]
)
delta_t = (
postprocessed_batch[SampleBatch.REWARDS]
+ gamma ** d_ts * vpred_t[1:]
- vpred_t[:-1]
)
            # This formula for the advantage is an adaptation of
            # "Generalized Advantage Estimation"
            # (https://arxiv.org/abs/1506.02438) which accounts for time steps
            # of irregular length (see the proposal linked in the module
            # docstring: https://docdro.id/400TvlR).
            # NOTE: The last time step's delta (d_ts[-1]) is not required.
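            # Spelled out (these formulas just restate the computations in
            # this block; d_t denotes the length of time step t as reported
            # via "d_ts"):
            #   delta_t = r_t + gamma^d_t * V(s_{t+1}) - V(s_t)
            #   A_t     = delta_t + (gamma * lambda)^d_t * A_{t+1}
            # With d_t == 1 for all t this reduces to standard GAE.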
postprocessed_batch[
Postprocessing.ADVANTAGES
] = generalized_discount_cumsum(delta_t, d_ts[:-1], gamma * lambda_)
postprocessed_batch[Postprocessing.VALUE_TARGETS] = (
postprocessed_batch[Postprocessing.ADVANTAGES]
+ postprocessed_batch[SampleBatch.VF_PREDS]
).astype(np.float32)
postprocessed_batch[Postprocessing.ADVANTAGES] = postprocessed_batch[
Postprocessing.ADVANTAGES
].astype(np.float32)


def generalized_discount_cumsum(
x: np.ndarray, deltas: np.ndarray, gamma: float
) -> np.ndarray:
"""Calculates the 'time-dependent' discounted cumulative sum over a
(reward) sequence `x`.
Recursive equations:
y[t] - gamma**deltas[t+1]*y[t+1] = x[t]
reversed(y)[t] - gamma**reversed(deltas)[t-1]*reversed(y)[t-1] =
reversed(x)[t]
Args:
x (np.ndarray): A sequence of rewards or one-step TD residuals.
deltas (np.ndarray): A sequence of time step deltas (length of time
steps).
gamma (float): The discount factor gamma.
Returns:
np.ndarray: The sequence containing the 'time-dependent' discounted
cumulative sums for each individual element in `x` till the end of
the trajectory.
Examples:
>>> x = np.array([0.0, 1.0, 2.0, 3.0])
>>> deltas = np.array([1.0, 4.0, 15.0])
>>> gamma = 0.9
>>> generalized_discount_cumsum(x, deltas, gamma)
... array([0.0 + 0.9^1.0*1.0 + 0.9^4.0*2.0 + 0.9^15.0*3.0,
... 1.0 + 0.9^4.0*2.0 + 0.9^15.0*3.0,
... 2.0 + 0.9^15.0*3.0,
... 3.0])
"""
reversed_x = x[::-1]
reversed_deltas = deltas[::-1]
reversed_y = np.empty_like(x)
reversed_y[0] = reversed_x[0]
for i in range(1, x.size):
reversed_y[i] = (
reversed_x[i] + gamma ** reversed_deltas[i - 1] * reversed_y[i - 1]
)
return reversed_y[::-1]
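

if __name__ == "__main__":
    # Quick sanity check (added for illustration; not part of the original
    # example): compare generalized_discount_cumsum against the brute-force
    # double sum
    #   y[t] = sum_{k >= t} gamma^(deltas[t] + ... + deltas[k-1]) * x[k]
    # using the numbers from the docstring example above.
    x = np.array([0.0, 1.0, 2.0, 3.0])
    deltas = np.array([1.0, 4.0, 15.0])
    gamma = 0.9
    expected = np.array(
        [
            sum(gamma ** deltas[t:k].sum() * x[k] for k in range(t, x.size))
            for t in range(x.size)
        ]
    )
    assert np.allclose(generalized_discount_cumsum(x, deltas, gamma), expected)
    print(generalized_discount_cumsum(x, deltas, gamma))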