1 | | -from typing import List, Dict, Any |
2 | | - |
3 | | -from rlgym.api import RewardFunction, AgentID |
4 | | -from rlgym.rocket_league.api import GameState |
| 1 | +from rlgym_tools.rocket_league.renderers.rocketsimvis_renderer import RocketSimVisRenderer |
5 | 2 | import os |
6 | | -import json |
7 | | -import socket |
8 | | -from typing import Dict, Any |
9 | | - |
10 | 3 | import numpy as np |
11 | | -from rlgym.api import Renderer |
12 | | -from rlgym.rocket_league.api import GameState, Car |
13 | 4 |
14 | | -DEFAULT_UDP_IP = "127.0.0.1" |
15 | | -DEFAULT_UDP_PORT = 9273 # Default RocketSimVis port |
16 | 5 | project_name="ExampleBot" |
17 | 6 |
18 | | -BUTTON_NAMES = ("throttle", "steer", "pitch", "yaw", "roll", "jump", "boost", "handbrake") |
19 | | - |
20 | | - |
21 | | -class RocketSimVisRenderer(Renderer[GameState]): |
22 | | - """ |
23 | | - A renderer that sends game state information to RocketSimVis. |
24 | | -
25 | | - This is just the client side, you need to run RocketSimVis to see the visualization. |
26 | | - Code is here: https://github.com/ZealanL/RocketSimVis |
27 | | - """ |
28 | | - def __init__(self, udp_ip=DEFAULT_UDP_IP, udp_port=DEFAULT_UDP_PORT): |
29 | | - self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) # UDP |
30 | | - self.udp_ip = udp_ip |
31 | | - self.udp_port = udp_port |
32 | | - |
33 | | - @staticmethod |
34 | | - def write_physobj(physobj): |
35 | | - j = { |
36 | | - 'pos': physobj.position.tolist(), |
37 | | - 'forward': physobj.forward.tolist(), |
38 | | - 'up': physobj.up.tolist(), |
39 | | - 'vel': physobj.linear_velocity.tolist(), |
40 | | - 'ang_vel': physobj.angular_velocity.tolist() |
41 | | - } |
42 | | - |
43 | | - return j |
44 | | - |
45 | | - @staticmethod |
46 | | - def write_car(car: Car, controls=None): |
47 | | - j = { |
48 | | - 'team_num': int(car.team_num), |
49 | | - 'phys': RocketSimVisRenderer.write_physobj(car.physics), |
50 | | - 'boost_amount': car.boost_amount, |
51 | | - 'on_ground': bool(car.on_ground), |
52 | | - "has_flipped_or_double_jumped": bool(car.has_flipped or car.has_double_jumped), |
53 | | - 'is_demoed': bool(car.is_demoed), |
54 | | - 'has_flip': bool(car.can_flip) |
55 | | - } |
56 | | - |
57 | | - if controls is not None: |
58 | | - if isinstance(controls, np.ndarray): |
59 | | - controls = { |
60 | | - k: float(v) |
61 | | - for k, v in zip(BUTTON_NAMES, controls) |
62 | | - } |
63 | | - j['controls'] = controls |
64 | | - |
65 | | - return j |
66 | | - |
67 | | - def render(self, state: GameState, shared_info: Dict[str, Any]) -> Any: |
68 | | - if "controls" in shared_info: |
69 | | - controls = shared_info["controls"] |
70 | | - else: |
71 | | - controls = {} |
72 | | - j = { |
73 | | - 'ball_phys': self.write_physobj(state.ball), |
74 | | - 'cars': [ |
75 | | - self.write_car(car, controls.get(agent_id)) |
76 | | - for agent_id, car in state.cars.items() |
77 | | - ], |
78 | | - 'boost_pad_states': (state.boost_pad_timers <= 0).tolist() |
79 | | - } |
80 | | - |
81 | | - self.sock.sendto(json.dumps(j).encode('utf-8'), (self.udp_ip, self.udp_port)) |
82 | | - |
83 | | - def close(self): |
84 | | - pass |
85 | | - |
86 | | -from typing import List, Dict, Any |
87 | | -from rlgym.api import RewardFunction, AgentID |
88 | | -from rlgym.rocket_league.api import GameState |
89 | | -from rlgym.rocket_league import common_values |
90 | | -import numpy as np |
91 | | - |
92 | | -from typing import Any, Dict, List |
93 | | -import numpy as np |
94 | | -from rlgym.rocket_league.common_values import BALL_MAX_SPEED |
95 | | - |
96 | | -class AdvancedTouchReward(RewardFunction[AgentID, GameState, float]): |
97 | | - def __init__(self, touch_reward: float = 0.0, acceleration_reward: float = 1, use_touch_count: bool = False): |
98 | | - self.touch_reward = touch_reward |
99 | | - self.acceleration_reward = acceleration_reward |
100 | | - self.use_touch_count = use_touch_count |
101 | | - |
102 | | - self.prev_ball = None |
103 | | - |
104 | | - def reset(self, agents: List[AgentID], initial_state: GameState, shared_info: Dict[str, Any]) -> None: |
105 | | - self.prev_ball = initial_state.ball |
106 | | - |
107 | | - def get_rewards(self, agents: List[AgentID], state: GameState, is_terminated: Dict[AgentID, bool], |
108 | | - is_truncated: Dict[AgentID, bool], shared_info: Dict[str, Any]) -> Dict[AgentID, float]: |
109 | | - rewards = {agent: 0 for agent in agents} |
110 | | - ball = state.ball |
111 | | - for agent in agents: |
112 | | - touches = state.cars[agent].ball_touches |
113 | | - |
114 | | - if touches > 0: |
115 | | - if not self.use_touch_count: |
116 | | - touches = 1 |
117 | | - acceleration = np.linalg.norm(ball.linear_velocity - self.prev_ball.linear_velocity) / BALL_MAX_SPEED |
118 | | - rewards[agent] += self.touch_reward * touches |
119 | | - rewards[agent] += acceleration * self.acceleration_reward |
120 | | - |
121 | | - self.prev_ball = ball |
122 | | - |
123 | | - return rewards |
124 | | - |
125 | | -class FaceBallReward(RewardFunction): |
126 | | - """Rewards the agent for facing the ball""" |
127 | | - def reset(self, agents: List[AgentID], initial_state: GameState, shared_info: Dict[str, Any]) -> None: |
128 | | - pass |
129 | | - |
130 | | - |
131 | | - def get_rewards(self, agents: List[AgentID], state: GameState, is_terminated: Dict[AgentID, bool], |
132 | | - is_truncated: Dict[AgentID, bool], shared_info: Dict[str, Any]) -> Dict[AgentID, float]: |
133 | | - rewards = {} |
134 | | - |
135 | | - for agent in agents: |
136 | | - car = state.cars[agent] |
137 | | - ball = state.ball |
138 | | - |
139 | | - car_pos = car.physics.position |
140 | | - ball_pos = ball.position |
141 | | - direction_to_ball = ball_pos - car_pos |
142 | | - norm = np.linalg.norm(direction_to_ball) |
143 | | - |
144 | | - if norm > 0: |
145 | | - direction_to_ball /= norm |
146 | | - |
147 | | - car_forward = car.physics.forward |
148 | | - dot_product = np.dot(car_forward, direction_to_ball) |
149 | | - |
150 | | - reward = dot_product # Dot product directly indicates alignment (-1 to 1) |
151 | | - rewards[agent] = reward |
152 | | - |
153 | | - return rewards |
154 | | - |
155 | | -class SpeedTowardBallReward(RewardFunction[AgentID, GameState, float]): |
156 | | - """Rewards the agent for moving quickly toward the ball""" |
157 | | - |
158 | | - def reset(self, agents: List[AgentID], initial_state: GameState, shared_info: Dict[str, Any]) -> None: |
159 | | - pass |
160 | | - |
161 | | - def get_rewards(self, agents: List[AgentID], state: GameState, is_terminated: Dict[AgentID, bool], |
162 | | - is_truncated: Dict[AgentID, bool], shared_info: Dict[str, Any]) -> Dict[AgentID, float]: |
163 | | - rewards = {} |
164 | | - for agent in agents: |
165 | | - car = state.cars[agent] |
166 | | - car_physics = car.physics if car.is_orange else car.inverted_physics |
167 | | - ball_physics = state.ball if car.is_orange else state.inverted_ball |
168 | | - player_vel = car_physics.linear_velocity |
169 | | - pos_diff = (ball_physics.position - car_physics.position) |
170 | | - dist_to_ball = np.linalg.norm(pos_diff) |
171 | | - dir_to_ball = pos_diff / dist_to_ball |
172 | | - |
173 | | - speed_toward_ball = np.dot(player_vel, dir_to_ball) |
174 | | - |
175 | | - rewards[agent] = max(speed_toward_ball / common_values.CAR_MAX_SPEED, 0.0) |
176 | | - return rewards |
177 | | - |
178 | | -class InAirReward(RewardFunction[AgentID, GameState, float]): |
179 | | - """Rewards the agent for being in the air""" |
180 | | - |
181 | | - def reset(self, agents: List[AgentID], initial_state: GameState, shared_info: Dict[str, Any]) -> None: |
182 | | - pass |
183 | | - |
184 | | - def get_rewards(self, agents: List[AgentID], state: GameState, is_terminated: Dict[AgentID, bool], |
185 | | - is_truncated: Dict[AgentID, bool], shared_info: Dict[str, Any]) -> Dict[AgentID, float]: |
186 | | - return {agent: float(not state.cars[agent].on_ground) for agent in agents} |
187 | | - |
188 | | -class VelocityBallToGoalReward(RewardFunction[AgentID, GameState, float]): |
189 | | - """Rewards the agent for hitting the ball toward the opponent's goal""" |
190 | | - |
191 | | - def reset(self, agents: List[AgentID], initial_state: GameState, shared_info: Dict[str, Any]) -> None: |
192 | | - pass |
193 | | - |
194 | | - def get_rewards(self, agents: List[AgentID], state: GameState, is_terminated: Dict[AgentID, bool], |
195 | | - is_truncated: Dict[AgentID, bool], shared_info: Dict[str, Any]) -> Dict[AgentID, float]: |
196 | | - rewards = {} |
197 | | - for agent in agents: |
198 | | - car = state.cars[agent] |
199 | | - ball = state.ball |
200 | | - if car.is_orange: |
201 | | - goal_y = -common_values.BACK_NET_Y |
202 | | - else: |
203 | | - goal_y = common_values.BACK_NET_Y |
204 | | - |
205 | | - ball_vel = ball.linear_velocity |
206 | | - pos_diff = np.array([0, goal_y, 0]) - ball.position |
207 | | - dist = np.linalg.norm(pos_diff) |
208 | | - dir_to_goal = pos_diff / dist |
209 | | - |
210 | | - vel_toward_goal = np.dot(ball_vel, dir_to_goal) |
211 | | - rewards[agent] = max(vel_toward_goal / common_values.BALL_MAX_SPEED, 0) |
212 | | - return rewards |
213 | | - |
214 | | - |
215 | | -class TouchReward(RewardFunction[AgentID, GameState, float]): |
216 | | - """ |
217 | | - A RewardFunction that gives a reward of 1 if the agent touches the ball, 0 otherwise. |
218 | | - """ |
219 | | - |
220 | | - def reset(self, agents: List[AgentID], initial_state: GameState, shared_info: Dict[str, Any]) -> None: |
221 | | - pass |
222 | | - |
223 | | - def get_rewards(self, agents: List[AgentID], state: GameState, is_terminated: Dict[AgentID, bool], |
224 | | - is_truncated: Dict[AgentID, bool], shared_info: Dict[str, Any]) -> Dict[AgentID, float]: |
225 | | - return {agent: self._get_reward(agent, state) for agent in agents} |
226 | | - |
227 | | - def _get_reward(self, agent: AgentID, state: GameState) -> float: |
228 | | - return 1. if state.cars[agent].ball_touches > 0 else 0. |
229 | | - |
230 | | - |
231 | | - |
232 | 7 | def build_rlgym_v2_env(): |
233 | 8 | import numpy as np |
234 | 9 | from rlgym.api import RLGym |
@@ -258,9 +33,9 @@ def build_rlgym_v2_env(): |
258 | 33 |
259 | 34 | reward_fn = CombinedReward( |
260 | 35 | (InAirReward(), 0.15), |
261 | | - (SpeedTowardBallReward(), 5), |
262 | | - (VelocityBallToGoalReward(), 10), |
263 | | - (TouchReward(), 50), |
| 36 | + (SpeedTowardBallReward(), 5.0), |
| 37 | + (VelocityBallToGoalReward(), 10.0), |
| 38 | + (TouchReward(), 50.0), |
264 | 39 | (SpeedTowardBallReward(), 5.0), |
265 | 40 | (FaceBallReward(), 1.0), |
266 | 41 | (VelocityBallToGoalReward(), 10.0), |
@@ -315,27 +90,27 @@ def build_rlgym_v2_env(): |
315 | 90 | learner = Learner(build_rlgym_v2_env, |
316 | 91 | n_proc=n_proc, |
317 | 92 | min_inference_size=min_inference_size, |
318 | | - metrics_logger=None, # Leave this empty for now. |
| 93 | + metrics_logger=None, # leave this as None for now, or pass a metrics logger to report concrete game information from training, depending on what you add (see the sketch below the diff).
319 | 94 | ppo_batch_size=100_000, # batch size - much higher than 300K doesn't seem to help most people |
320 | | - policy_layer_sizes=[512, 512, 512], # policy network |
321 | | - critic_layer_sizes=[512, 512, 512], # critic network |
| 95 | + policy_layer_sizes=[512, 512, 512], # policy network layer sizes |
| 96 | + critic_layer_sizes=[512, 512, 512], # critic network layer sizes |
322 | 97 | ts_per_iteration=100_000, # timesteps per training iteration - set this equal to the batch size |
323 | 98 | exp_buffer_size=300_000, # size of experience buffer - keep this 2 - 3x the batch size |
324 | 99 | ppo_minibatch_size=50_000, # minibatch size - set this as high as your GPU can handle |
325 | 100 | ppo_ent_coef=0.01, # entropy coefficient - this determines the impact of exploration
326 | 101 | render=True, |
327 | | - render_delay=0.047, |
| 102 | + render_delay=0, # real-time seconds to wait between rendered steps; set this to TICK_SKIP / TICK_RATE (120 physics ticks per second) so one rendered step matches real time (see the sketch below the diff).
328 | 103 | add_unix_timestamp=False, |
329 | 104 | checkpoint_load_folder=checkpoint_load_folder, |
330 | 105 | checkpoints_save_folder=checkpoint_folder, # folder to save checkpoints to
331 | | - policy_lr=2e-4, # policy learning rate |
| 106 | + policy_lr=2e-4, # policy learning rate; keep this equal to the critic learning rate.
332 | 107 | device="auto", # device to use
333 | | - critic_lr=2e-4, # critic learning rate |
| 108 | + critic_lr=2e-4, # critic learning rate; keep this equal to the policy learning rate.
334 | 109 | ppo_epochs=2, # number of PPO epochs |
335 | 110 | standardize_returns=True, # Don't touch these. |
336 | 111 | standardize_obs=False, # Don't touch these. |
337 | 112 | save_every_ts=10_000_000, # save every 10M steps
338 | 113 | timestep_limit=50_000_000_000, # Train for 50B steps
339 | | - log_to_wandb=False # Set this to True if you want to use Weights & Biases for logging. |
| 114 | + log_to_wandb=False # Set this to True to log with Weights & Biases, the most widely used logging option.
340 | 115 | ) |
341 | 116 | learner.learn() |
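A minimal sketch of the render_delay arithmetic referred to in the comment above, assuming Rocket League's physics rate of 120 ticks per second; TICK_SKIP is an illustrative constant that must match however many physics ticks each agent action is repeated for in your environment:

# Assumed constants, not taken from the diff.
TICK_RATE = 120  # Rocket League physics ticks per second
TICK_SKIP = 8    # physics ticks covered by one agent step (illustrative value)

# One agent step spans TICK_SKIP ticks, so it lasts TICK_SKIP / TICK_RATE seconds
# of real time; passing that as render_delay makes rendering play at real speed.
render_delay = TICK_SKIP / TICK_RATE  # 8 / 120 ~= 0.067 seconds per step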
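For metrics_logger, a minimal sketch of a custom logger, assuming the rlgym_ppo MetricsLogger base class with its _collect_metrics/_report_metrics hooks; the metric tracked here (ball height) is purely illustrative:

import numpy as np
from rlgym_ppo.util import MetricsLogger


class ExampleLogger(MetricsLogger):
    # Called on each collected game state; return the raw numbers worth tracking.
    def _collect_metrics(self, game_state) -> list:
        return [game_state.ball.position[2]]  # ball height

    # Called once per report with everything collected since the last report.
    def _report_metrics(self, collected_metrics, wandb_run, cumulative_timesteps):
        avg_ball_height = float(np.mean([m[0] for m in collected_metrics]))
        if wandb_run is not None:  # wandb_run is only set when log_to_wandb=True
            wandb_run.log({"ball_height": avg_ball_height,
                           "cumulative_timesteps": cumulative_timesteps})

An instance would then be passed to the Learner as metrics_logger=ExampleLogger() instead of None.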