doubly_robust.py
from typing import List

import numpy as np

from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimate
from ray.rllib.offline.estimators.direct_method import DirectMethod, k_fold_cv
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.annotations import DeveloperAPI, override
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.typing import SampleBatchType


@DeveloperAPI
class DoublyRobust(DirectMethod):
    """The Doubly Robust (DR) estimator.

    DR estimator as described in https://arxiv.org/pdf/1511.03722.pdf
    """

    @override(DirectMethod)
    def estimate(
        self, batch: SampleBatchType, should_train: bool = True
    ) -> List[OffPolicyEstimate]:
        self.check_can_estimate_for(batch)
        estimates = []
        # Split data into train and test sets using k-fold cross validation.
        for train_episodes, test_episodes in k_fold_cv(batch, self.k, should_train):

            # Train Q-function on the training fold.
            if train_episodes:
                # Reinitialize model.
                self.model.reset()
                train_batch = SampleBatch.concat_samples(train_episodes)
                losses = self.train(train_batch)
                self.losses.append(losses)

            # Calculate doubly robust OPE estimates on the test fold.
            for episode in test_episodes:
                rewards, old_prob = episode["rewards"], episode["action_prob"]
                new_prob = np.exp(self.action_log_likelihood(episode))

                v_old = 0.0
                v_new = 0.0
                # Q-values of the logged actions under the trained model.
                q_values = self.model.estimate_q(
                    episode[SampleBatch.OBS], episode[SampleBatch.ACTIONS]
                )
                q_values = convert_to_numpy(q_values)

                # Evaluate the new policy's action probabilities at every step
                # to compute model-based state-value estimates.
                all_actions = np.zeros([episode.count, self.policy.action_space.n])
                all_actions[:] = np.arange(self.policy.action_space.n)
                # Two transposes required for torch.distributions to work.
                tmp_episode = episode.copy()
                tmp_episode[SampleBatch.ACTIONS] = all_actions.T
                action_probs = np.exp(self.action_log_likelihood(tmp_episode)).T
                v_values = self.model.estimate_v(
                    episode[SampleBatch.OBS], action_probs
                )
                v_values = convert_to_numpy(v_values)

                # Backward DR recursion: v_old accumulates the behavior policy's
                # discounted return; v_new is the doubly robust estimate.
                for t in reversed(range(episode.count)):
                    v_old = rewards[t] + self.gamma * v_old
                    v_new = v_values[t] + (new_prob[t] / old_prob[t]) * (
                        rewards[t] + self.gamma * v_new - q_values[t]
                    )
                v_new = v_new.item()

                estimates.append(
                    OffPolicyEstimate(
                        self.name,
                        {
                            "v_old": v_old,
                            "v_new": v_new,
                            "v_gain": v_new / max(1e-8, v_old),
                        },
                    )
                )
        return estimates
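

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of RLlib): the per-episode doubly robust
# backup performed in the inner loop of `estimate()`, written as a standalone
# NumPy-compatible function. The name `dr_episode_estimate` and its signature
# are hypothetical. It assumes equal-length per-timestep arrays, where
# `v_values[t]` is the model's state-value estimate, `q_values[t]` is the
# model's Q-value for the logged action at step t, and `gamma` is the
# discount factor.
# ---------------------------------------------------------------------------
def dr_episode_estimate(rewards, old_prob, new_prob, q_values, v_values, gamma):
    """Return (v_old, v_new): the behavior return and the DR estimate.

    Backward recursion from https://arxiv.org/pdf/1511.03722.pdf:
        V_DR(t) = V_hat(s_t) + rho_t * (r_t + gamma * V_DR(t + 1) - Q_hat(s_t, a_t)),
    with importance ratio rho_t = pi_new(a_t | s_t) / pi_behavior(a_t | s_t).
    """
    v_old, v_new = 0.0, 0.0
    for t in reversed(range(len(rewards))):
        # Plain discounted return under the behavior (logging) policy.
        v_old = rewards[t] + gamma * v_old
        # Importance ratio of the evaluated policy vs. the behavior policy.
        rho_t = new_prob[t] / old_prob[t]
        # Doubly robust backup: model value plus importance-corrected residual.
        v_new = v_values[t] + rho_t * (rewards[t] + gamma * v_new - q_values[t])
    return v_old, v_new


# Example usage with toy arrays (hypothetical data, for illustration only):
#     import numpy as np
#     T = 4
#     v_old, v_new = dr_episode_estimate(
#         rewards=np.ones(T),
#         old_prob=np.full(T, 0.5),
#         new_prob=np.full(T, 0.6),
#         q_values=np.full(T, 2.0),
#         v_values=np.full(T, 2.0),
#         gamma=0.99,
#     )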