import random
from collections import deque  # Double-ended queue for replay memory

import numpy as np
import torch

from environment import Direction, Point
from model import Linear_QNet, QTrainer

# Define constants for replay memory and training
MAX_MEMORY = 100_000  # Maximum size of the replay memory buffer
BATCH_SIZE = 1000     # Batch size for training on replayed experience
LR = 0.001            # Learning rate for the optimizer

class Agent:
    """
    This class implements the agent responsible for learning to play the Snake game using a Deep Q-Network (DQN)
    with experience replay.
    """

    def __init__(self):
        """
        Initializes the agent's core components:
        - n_games: Tracks the number of games played.
        - epsilon: Controls the tradeoff between exploration (random actions) and exploitation (choosing the
          best-known action); it is recomputed from n_games on every move.
        - gamma: The discount factor that weights future rewards relative to immediate ones.
        - memory: Stores experiences (state, action, reward, next state, done) in a deque for experience replay.
        - model: The neural network that predicts Q-values for each action; trainer updates the model's weights
          using the Q-learning rule.
        """
        self.n_games = 0  # Counter for the number of games played
        self.epsilon = 0  # Exploration rate (recomputed per move in get_action)
        self.gamma = 0.9  # Discount rate (between 0 and 1)
        self.memory = deque(maxlen=MAX_MEMORY)  # Replay buffer; oldest entries are evicted when full
        self.model = Linear_QNet(11, 256, 3)  # 11 inputs, 256 hidden units, 3 outputs (actions)
        self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)

    def get_state(self, game):
        """
        Extracts the current state of the game (the positions of the snake and food, and any adjacent dangers)
        and encodes it as a list of 11 binary features. The agent uses this representation to decide its next
        action.
        :param game: the current game instance
        :return: a numpy array of 11 binary features
        """
        # Get the state of the game, including positions of the snake and food
        head = game.snake[0]  # Snake's head position
        point_l = Point(head.x - 20, head.y)  # Point one step to the left
        point_r = Point(head.x + 20, head.y)  # Point one step to the right
        point_u = Point(head.x, head.y - 20)  # Point one step up
        point_d = Point(head.x, head.y + 20)  # Point one step down

        # Get the direction the snake is currently moving in
        dir_l = game.direction == Direction.LEFT
        dir_r = game.direction == Direction.RIGHT
        dir_u = game.direction == Direction.UP
        dir_d = game.direction == Direction.DOWN

        # Build the state array: 11 features (danger, direction, food location)
        state = [
            # Danger straight ahead
            (dir_r and game.is_collision(point_r)) or
            (dir_l and game.is_collision(point_l)) or
            (dir_u and game.is_collision(point_u)) or
            (dir_d and game.is_collision(point_d)),

            # Danger to the right
            (dir_u and game.is_collision(point_r)) or
            (dir_d and game.is_collision(point_l)) or
            (dir_l and game.is_collision(point_u)) or
            (dir_r and game.is_collision(point_d)),

            # Danger to the left
            (dir_d and game.is_collision(point_r)) or
            (dir_u and game.is_collision(point_l)) or
            (dir_r and game.is_collision(point_u)) or
            (dir_l and game.is_collision(point_d)),

            # Current direction of the snake
            dir_l,
            dir_r,
            dir_u,
            dir_d,

            # Location of the food relative to the snake's head
            game.food.x < game.head.x,  # Food is left
            game.food.x > game.head.x,  # Food is right
            game.food.y < game.head.y,  # Food is above
            game.food.y > game.head.y   # Food is below
        ]

        return np.array(state, dtype=int)  # Return the state as a numpy array

    def remember(self, state, action, reward, next_state, done):
        """
        Stores an experience tuple (state, action, reward, next state, done) in memory for experience replay.
        This lets the agent learn from past experiences.
        :param state: state before the action
        :param action: action taken
        :param reward: reward received
        :param next_state: state after the action
        :param done: whether the game ended on this step
        :return: None
        """
        # Append the experience; the deque pops its oldest entry once MAX_MEMORY is reached
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self):
        """
        Trains the agent using experience replay. If enough experiences have been stored in memory, it samples a
        random batch and trains the model on it; otherwise it trains on all stored experiences.
        :return: None
        """
        # Train on a random batch of experiences from memory
        if len(self.memory) > BATCH_SIZE:
            mini_sample = random.sample(self.memory, BATCH_SIZE)  # Random sample from memory
        else:
            mini_sample = self.memory  # Use all of memory if there are not enough samples

        # Transpose the batch into tuples of states, actions, rewards, next states, and done flags
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

        # Slower per-sample alternative:
        # for state, action, reward, next_state, done in mini_sample:
        #     self.trainer.train_step(state, action, reward, next_state, done)

    def train_short_memory(self, state, action, reward, next_state, done):
        """
        Trains the agent immediately after each step, using only the most recent experience. It updates the model
        based on the current state, the action taken, the reward received, and the next state.
        :param state: state before the action
        :param action: action taken
        :param reward: reward received
        :param next_state: state after the action
        :param done: whether the game ended on this step
        :return: None
        """
        # Train on the single most recent experience
        self.trainer.train_step(state, action, reward, next_state, done)

    def get_action(self, state):
        """
        Determines the action the agent will take: either a random action (exploration) or the best action for
        the current state (exploitation), chosen by predicting Q-values with the neural network. As the agent
        plays more games, it explores less and exploits more.
        :param state: the current state, as returned by get_state
        :return: a one-hot action list [straight, right, left]
        """
        # Epsilon-greedy tradeoff between exploration and exploitation
        self.epsilon = 80 - self.n_games  # Exploration shrinks as more games are played
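        # Illustrative decay: at n_games = 0, epsilon = 80, so a random move is
        # taken whenever randint(0, 200) < 80 (roughly 40% of the time); once
        # n_games reaches 80, epsilon <= 0 and the agent always exploits.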
        final_move = [0, 0, 0]  # One-hot action: [straight, right, left]
        if random.randint(0, 200) < self.epsilon:
            move = random.randint(0, 2)  # Random action (exploration)
            final_move[move] = 1
        else:
            state0 = torch.tensor(state, dtype=torch.float)  # Convert the state to a tensor
            prediction = self.model(state0)  # Predict Q-values with the neural network
            move = torch.argmax(prediction).item()  # Choose the action with the highest Q-value
            final_move[move] = 1

        return final_move
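

# --- Usage sketch (not part of the original file) ---
# A minimal training loop showing how the Agent's methods fit together. It
# assumes the environment module also exposes a game class; the name
# SnakeGame and its reset() / play_step(action) -> (reward, done, score)
# interface are assumptions here, so adjust them to match your environment.
if __name__ == "__main__":
    from environment import SnakeGame  # Assumed game class (hypothetical name)

    agent = Agent()
    game = SnakeGame()
    while True:
        state_old = agent.get_state(game)         # Observe the current state
        final_move = agent.get_action(state_old)  # Pick an action (epsilon-greedy)
        reward, done, score = game.play_step(final_move)  # Apply it to the game
        state_new = agent.get_state(game)          # Observe the resulting state

        # Learn from the single step, then store it for replay
        agent.train_short_memory(state_old, final_move, reward, state_new, done)
        agent.remember(state_old, final_move, reward, state_new, done)

        if done:
            game.reset()
            agent.n_games += 1
            agent.train_long_memory()  # Replay a batch of past experience
            print(f"Game {agent.n_games} finished with score {score}")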