Skip to content
This repository was archived by the owner on Dec 28, 2023. It is now read-only.

Commit 1183818

Browse files
author
Isaac Poulton
authored
Add continuous control (#8)
* Add normal distribution
* Add NormalOutput output layer
* Implement continuous control
1 parent 5a3a0fe commit 1183818

File tree

13 files changed

+265
-37
lines changed

13 files changed

+265
-37
lines changed

example/gym_client.cpp

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,24 @@ using namespace cpprl;
1515

1616
// Algorithm hyperparameters
1717
const std::string algorithm = "PPO";
18-
const int batch_size = 40;
18+
const int batch_size = 512;
1919
const float clip_param = 0.2;
2020
const float discount_factor = 0.99;
21-
const float entropy_coef = 1e-3;
22-
const float learning_rate = 1e-3;
21+
const float entropy_coef = 1e-5;
22+
const float gae = 0.95;
23+
const float learning_rate = 3e-4;
24+
const int log_interval = 1;
2325
const int num_epoch = 3;
24-
const int num_mini_batch = 20;
26+
const int num_mini_batch = 32;
2527
const int reward_average_window_size = 10;
2628
const bool use_gae = true;
2729
const float value_loss_coef = 0.5;
2830

2931
// Environment hyperparameters
30-
const std::string env_name = "LunarLander-v2";
31-
const int num_envs = 8;
3232
const float env_gamma = discount_factor; // Set to -1 to disable
33+
const std::string env_name = "BipedalWalker-v2";
34+
const int num_envs = 8;
35+
const float render_reward_threshold = 300;
3336

3437
// Model hyperparameters
3538
const int hidden_size = 64;
@@ -117,7 +120,7 @@ int main(int argc, char *argv[])
117120
base = std::make_shared<CnnBase>(env_info->observation_space_shape[0], recurrent, hidden_size);
118121
}
119122
base->to(device);
120-
ActionSpace space{"Discrete", env_info->action_space_shape};
123+
ActionSpace space{env_info->action_space_type, env_info->action_space_shape};
121124
Policy policy(space, base);
122125
policy->to(device);
123126
RolloutStorage storage(batch_size, num_envs, env_info->observation_space_shape, space, hidden_size, device);
@@ -152,11 +155,14 @@ int main(int argc, char *argv[])
152155
storage.get_masks()[step]);
153156
}
154157
auto actions_tensor = act_result[1].cpu();
155-
int64_t *actions_array = actions_tensor.data<int64_t>();
156-
std::vector<std::vector<int>> actions(num_envs);
158+
float *actions_array = actions_tensor.data<float>();
159+
std::vector<std::vector<float>> actions(num_envs);
157160
for (int i = 0; i < num_envs; ++i)
158161
{
159-
actions[i] = {static_cast<int>(actions_array[i])};
162+
for (int j = 0; j < env_info->action_space_shape[0]; j++)
163+
{
164+
actions[i].push_back(actions_array[i * env_info->action_space_shape[0] + j]);
165+
}
160166
}
161167

162168
auto step_param = std::make_shared<StepParam>();
@@ -219,12 +225,12 @@ int main(int argc, char *argv[])
219225
storage.get_masks()[-1])
220226
.detach();
221227
}
222-
storage.compute_returns(next_value, use_gae, discount_factor, 0.9);
228+
storage.compute_returns(next_value, use_gae, discount_factor, gae);
223229

224230
auto update_data = algo->update(storage);
225231
storage.after_update();
226232

227-
if (update % 10 == 0 && update > 0)
233+
if (update % log_interval == 0 && update > 0)
228234
{
229235
auto total_steps = (update + 1) * batch_size * num_envs;
230236
auto run_time = std::chrono::high_resolution_clock::now() - start_time;
@@ -241,7 +247,7 @@ int main(int argc, char *argv[])
241247
// Sum the reward window with a float initial value. std::accumulate
// accumulates in the type of its init argument, so passing the int literal 0
// would truncate every partial sum to an integer before the division below.
float average_reward = std::accumulate(reward_history.begin(), reward_history.end(), 0.0f);
242248
average_reward /= episode_count < reward_average_window_size ? episode_count : reward_average_window_size;
243249
spdlog::info("Reward: {}", average_reward);
244-
render = average_reward > 180;
250+
render = average_reward >= render_reward_threshold;
245251
}
246252
}
247253
}

example/requests.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ struct ResetParam
3838

3939
struct StepParam
4040
{
41-
std::vector<std::vector<int>> actions;
41+
std::vector<std::vector<float>> actions;
4242
bool render;
4343
MSGPACK_DEFINE_MAP(actions, render);
4444
};

gym_server/server.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,8 @@ def step(self,
110110
if isinstance(self.env.action_space, gym.spaces.Discrete):
111111
actions = actions.squeeze(-1)
112112
observation, reward, done, info = self.env.step(actions)
113-
if isinstance(self.env.action_space, gym.spaces.Discrete):
114-
reward = np.expand_dims(reward, -1)
115-
done = np.expand_dims(done, -1)
113+
reward = np.expand_dims(reward, -1)
114+
done = np.expand_dims(done, -1)
116115
if render:
117116
self.env.render()
118117
return observation, reward, done, info

include/cpprl/distributions/distribution.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ class Distribution
1010
virtual ~Distribution() = 0;
1111

1212
virtual torch::Tensor entropy() = 0;
13-
virtual torch::Tensor get_logits() = 0;
14-
virtual torch::Tensor get_probs() = 0;
1513
virtual torch::Tensor log_prob(torch::Tensor value) = 0;
1614
virtual torch::Tensor sample(c10::ArrayRef<int64_t> sample_shape = {}) = 0;
1715
};
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#pragma once
2+
3+
#include <c10/util/ArrayRef.h>
4+
#include <torch/torch.h>
5+
6+
#include "cpprl/distributions/distribution.h"
7+
8+
namespace cpprl
9+
{
10+
class Normal : public Distribution
11+
{
12+
private:
13+
torch::Tensor loc, scale;
14+
std::vector<int64_t> batch_shape, event_shape;
15+
16+
std::vector<int64_t> extended_shape(c10::ArrayRef<int64_t> sample_shape);
17+
18+
public:
19+
Normal(const torch::Tensor loc, const torch::Tensor scale);
20+
21+
torch::Tensor entropy();
22+
torch::Tensor log_prob(torch::Tensor value);
23+
torch::Tensor sample(c10::ArrayRef<int64_t> sample_shape = {});
24+
25+
inline torch::Tensor get_loc() { return loc; }
26+
inline torch::Tensor get_scale() { return scale; }
27+
};
28+
}

include/cpprl/model/output_layers.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,16 @@ class CategoricalOutput : public OutputLayer
3030

3131
std::unique_ptr<Distribution> forward(torch::Tensor x);
3232
};
33+
34+
class NormalOutput : public OutputLayer
35+
{
36+
private:
37+
nn::Linear linear_loc;
38+
torch::Tensor scale_log;
39+
40+
public:
41+
NormalOutput(unsigned int num_inputs, unsigned int num_outputs);
42+
43+
std::unique_ptr<Distribution> forward(torch::Tensor x);
44+
};
3345
}

include/cpprl/model/policy.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,18 @@
77

88
#include "cpprl/model/nn_base.h"
99
#include "cpprl/model/output_layers.h"
10+
#include "cpprl/spaces.h"
1011

1112
using namespace torch;
1213

1314
namespace cpprl
1415
{
15-
class ActionSpace;
16-
1716
class PolicyImpl : public nn::Module
1817
{
1918
private:
2019
std::shared_ptr<NNBase> base;
2120
std::shared_ptr<OutputLayer> output_layer;
21+
ActionSpace action_space;
2222

2323
std::vector<torch::Tensor> forward_gru(torch::Tensor x,
2424
torch::Tensor hxs,

src/distributions/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
target_sources(cpprl
22
PRIVATE
33
${CMAKE_CURRENT_LIST_DIR}/categorical.cpp
4+
${CMAKE_CURRENT_LIST_DIR}/normal.cpp
45
)
56

67
if (CPPRL_BUILD_TESTS)
78
target_sources(cpprl_tests
89
PRIVATE
910
${CMAKE_CURRENT_LIST_DIR}/categorical.cpp
11+
${CMAKE_CURRENT_LIST_DIR}/normal.cpp
1012
)
1113
endif (CPPRL_BUILD_TESTS)

src/distributions/normal.cpp

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#define _USE_MATH_DEFINES
2+
#include <math.h>
3+
#include <cmath>
4+
#include <limits>
5+
6+
#include <c10/util/ArrayRef.h>
7+
#include <torch/torch.h>
8+
9+
#include "cpprl/distributions/normal.h"
10+
#include "third_party/doctest.h"
11+
12+
namespace cpprl
13+
{
14+
// Builds the distribution from a mean (`loc`) and a standard deviation
// (`scale`), broadcasting the two tensors to a common shape first.
Normal::Normal(const torch::Tensor loc,
               const torch::Tensor scale)
{
    const auto expanded = torch::broadcast_tensors({loc, scale});
    this->loc = expanded[0];
    this->scale = expanded[1];

    // The whole broadcast shape is recorded as the batch shape; the event
    // shape is left empty.
    event_shape = {};
    batch_shape = this->loc.sizes().vec();
}
23+
24+
// Entropy of the distribution, summed over the last dimension.
torch::Tensor Normal::entropy()
{
    // Per-element entropy of a Gaussian: 0.5 + 0.5 * ln(2 * pi) + ln(scale).
    const double constant_term = 0.5 + 0.5 * std::log(2 * M_PI);
    const auto per_element = constant_term + torch::log(scale);
    return per_element.sum(-1);
}
28+
29+
std::vector<int64_t> Normal::extended_shape(c10::ArrayRef<int64_t> sample_shape)
30+
{
31+
std::vector<int64_t> output_shape;
32+
output_shape.insert(output_shape.end(),
33+
sample_shape.begin(),
34+
sample_shape.end());
35+
output_shape.insert(output_shape.end(),
36+
batch_shape.begin(),
37+
batch_shape.end());
38+
output_shape.insert(output_shape.end(),
39+
event_shape.begin(),
40+
event_shape.end());
41+
return output_shape;
42+
}
43+
44+
// Element-wise log-density of `value`:
//   -(value - loc)^2 / (2 * scale^2) - ln(scale) - ln(sqrt(2 * pi))
// The result is not reduced over any dimension.
torch::Tensor Normal::log_prob(torch::Tensor value)
{
    const auto squared_error = (value - loc).pow(2);
    const auto two_variance = 2 * scale.pow(2);
    const double log_sqrt_two_pi = std::log(std::sqrt(2 * M_PI));

    return -squared_error / two_variance - scale.log() - log_sqrt_two_pi;
}
53+
54+
// Draws a sample of shape sample_shape + batch_shape + event_shape.
// Sampling runs under a NoGradGuard, so the draw is not tracked by autograd.
torch::Tensor Normal::sample(c10::ArrayRef<int64_t> sample_shape)
{
    const std::vector<int64_t> target_shape = extended_shape(sample_shape);

    torch::NoGradGuard no_grad;
    const auto expanded_loc = loc.expand(target_shape);
    const auto expanded_scale = scale.expand(target_shape);
    return torch::normal(expanded_loc, expanded_scale);
}
60+
61+
// Unit tests for Normal: sample shapes, entropy values and log-probabilities.
// The fixture is a 2x3 grid of Gaussians whose last element has scale 0, so
// the degenerate (-inf / NaN) results are pinned down as well.
TEST_CASE("Normal")
{
    using Shape = std::vector<int64_t>;

    float means[] = {0, 1, 2, 3, 4, 5};
    float std_devs[] = {5, 4, 3, 2, 1, 0};
    auto mean_tensor = torch::from_blob(means, {2, 3});
    auto std_dev_tensor = torch::from_blob(std_devs, {2, 3});
    auto dist = Normal(mean_tensor, std_dev_tensor);

    SUBCASE("Sampled tensors have correct shape")
    {
        CHECK(dist.sample().sizes().vec() == Shape{2, 3});
        CHECK(dist.sample({20}).sizes().vec() == Shape{20, 2, 3});
        CHECK(dist.sample({2, 20}).sizes().vec() == Shape{2, 20, 2, 3});
        CHECK(dist.sample({1, 2, 3, 4, 5}).sizes().vec() == Shape{1, 2, 3, 4, 5, 2, 3});
    }

    SUBCASE("entropy()")
    {
        auto entropies = dist.entropy();

        SUBCASE("Returns correct values")
        {
            INFO("Entropies: \n"
                 << entropies);

            const double first_row = entropies[0].item().toDouble();
            const double second_row = entropies[1].item().toDouble();

            CHECK(first_row == doctest::Approx(8.3512).epsilon(1e-3));
            // The second row contains scale == 0, whose log is -infinity.
            CHECK(second_row == -std::numeric_limits<float>::infinity());
        }

        SUBCASE("Output tensor is the correct size")
        {
            CHECK(entropies.sizes().vec() == Shape{2});
        }
    }

    SUBCASE("log_prob()")
    {
        // Same three actions evaluated against both rows of the fixture.
        float actions[] = {0, 1, 2,
                           0, 1, 2};
        auto actions_tensor = torch::from_blob(actions, {2, 3});
        auto log_probs = dist.log_prob(actions_tensor);

        INFO(log_probs << "\n");
        SUBCASE("Returns correct values")
        {
            // Expected values for the five elements with a non-zero scale,
            // in row-major order.
            const double expected[] = {-2.5284, -2.3052, -2.0176, -2.7371, -5.4189};
            for (int flat = 0; flat < 5; ++flat)
            {
                const double actual = log_probs[flat / 3][flat % 3].item().toDouble();
                CHECK(actual == doctest::Approx(expected[flat]).epsilon(1e-3));
            }
            // scale == 0 yields -inf + inf, i.e. NaN.
            CHECK(std::isnan(log_probs[1][2].item().toDouble()));
        }

        SUBCASE("Output tensor is correct size")
        {
            CHECK(log_probs.sizes().vec() == Shape{2, 3});
        }
    }
}
127+
}

src/model/nn_base.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ NNBase::NNBase(bool recurrent,
2525

2626
// Do not use.
2727
//
28-
// Instantiate a subclass and use their's instead
28+
// Instantiate a subclass and use theirs instead
2929
std::vector<torch::Tensor> NNBase::forward(torch::Tensor /*inputs*/,
3030
torch::Tensor /*hxs*/,
3131
torch::Tensor /*masks*/)

0 commit comments

Comments
 (0)