
Commit d4af755
Author: apbose
Commit message: Tabular Methods
Parent: ee0c46f

File tree: 1 file changed (+353 −0 lines)


TabularMethods/tabularmethod.py

Lines changed: 353 additions & 0 deletions
@@ -0,0 +1,353 @@
import gym
import pybulletgym
import pybulletgym.envs
import numpy as np
import math
import matplotlib.pyplot as plt
from numpy.linalg import pinv
import time

env = gym.make("FrozenLake-v0")
env.reset()
env.render()

"""
Action space:
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
"""
state = env.reset()
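
# For reference: FrozenLake-v0 defaults to the 4x4 map, i.e. 16 discrete
# states (0..15 in row-major order, start at state 0, goal at state 15) and
# the 4 actions listed above, which is why the tables built below are sized
# 16 states x 4 actions x 16 next states.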


##TestPolicy
def TestPolicy(environment, policy, trials=100):
    """
    Evaluate a deterministic policy by rolling it out in the environment.
    :param environment: the environment
    :param policy: the input policy (array of actions indexed by state)
    :param trials: the number of trials
    """
    success = 0
    rewards = 0
    for i in range(trials):
        terminated = False
        state = environment.reset()
        #print("state", i, state)
        while not terminated:
            action = int(policy[state])
            next_state, reward, terminated, info = environment.step(action)
            rewards = rewards + reward
            state = next_state
            if (terminated and reward == 1):
                success = success + 1
    av_reward = rewards / trials
    success = success / trials
    return (av_reward, success)

#taking a random deterministic policy
policy_rand = np.random.randint(4, size=env.nS)

#assigning the policy given in the question: action = (state + 1) mod 4
policy_ques = np.zeros(env.nS)
states = np.arange(0, 16)
policy_ques[states] = np.mod(states + 1, 4)
print(policy_ques)

output = TestPolicy(env, policy_ques)
print(output)

#learn model
def LearnModel(environment, samples=100000):
    """
    Learn the transition probabilities P(s'|s, a) and the expected rewards
    R(s, a, s') from randomly sampled transitions.
    :param environment: the environment
    :param samples: number of transition samples to collect
    """
    init_state = environment.reset()
    dict_action_nextState = {}
    dict_reward = {}
    num_visited = {}
    for states in range(16):
        dict_action_nextState[states] = np.zeros(shape=(environment.nA, environment.nS))
        dict_reward[states] = np.zeros(shape=(environment.nA, environment.nS))
        #num_visited[states] = np.zeros(shape=(environment.nA, environment.nS))

    for iters in range(samples):
        action = np.random.randint(4)
        next_state, reward, terminated, info = environment.step(action)
        dict_action_nextState[init_state][action][next_state] += 1
        dict_reward[init_state][action][next_state] += reward
        init_state = next_state
        if (terminated):
            #take one more step so that the terminal (absorbing) state is also counted as a source state
            act = np.random.randint(4)
            next_state, reward, terminated, info = environment.step(act)
            dict_action_nextState[init_state][act][next_state] += 1
            init_state = environment.reset()

    #turn the counts into estimates: first the expected reward per (s, a, s')
    #from accumulated reward / visit count, then normalise each (s, a) row of
    #counts into P(s'|s, a); unvisited rows are left at zero instead of
    #producing a division by zero
    for states in range(16):
        counts = dict_action_nextState[states]
        dict_reward[states] = np.divide(dict_reward[states], counts,
                                        out=np.zeros_like(dict_reward[states]),
                                        where=counts > 0)
        for row in range(counts.shape[0]):
            row_sum = np.sum(counts[row])
            if row_sum > 0:
                dict_action_nextState[states][row] = counts[row] / row_sum

    return dict_action_nextState, dict_reward


dict_action_nextState, dict_reward = LearnModel(env, 20000)
print("========================ACTION_NEXTSTATE==============================")
print(dict_action_nextState)
print("========================REWARD========================================")
print(dict_reward)
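
# Quick sanity check on the learned model (illustrative sketch, not part of
# the original script): every (state, action) row that was visited should now
# be a proper probability distribution over next states.
for s in range(env.nS):
    for a in range(env.nA):
        row_sum = dict_action_nextState[s][a].sum()
        assert row_sum == 0 or abs(row_sum - 1.0) < 1e-6, (s, a, row_sum)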

def Policy_Evaluation(environment, policy, discount_factor=1, theta=1e-9, max_iterations=1e9):
    V = np.zeros(environment.nS)
    evaluate_iter = 0
    for i in range(int(max_iterations)):
        delta = 0
        evaluate_iter += 1
        for state in range(environment.nS):
            v = 0
            for next_state in range(environment.nS):
                v += dict_action_nextState[state][int(policy[state])][next_state] * (dict_reward[state][int(policy[state])][next_state] + discount_factor * V[next_state])

            #calculate the delta change of the value function
            delta = max(delta, np.abs(V[state] - v))
            #update the value function
            V[state] = v

        # Terminate if the value change is insignificant
        if delta < theta:
            #print(f'Policy evaluated in {evaluate_iter} iterations.')
            return V

    print(delta)
    return V
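
# For reference, the backup computed above is the Bellman expectation update
# under the learned model, iterated until the largest per-state change falls
# below theta:
#   V(s) <- sum_{s'} P_hat(s'|s, pi(s)) * [ R_hat(s, pi(s), s') + gamma * V(s') ]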

def Lookahead(environment, state, V, discount_factor):
    action_values = np.zeros(environment.nA)
    for action in range(environment.nA):
        for next_state in range(environment.nS):
            action_values[action] += dict_action_nextState[state][action][next_state] * (dict_reward[state][action][next_state] + discount_factor * V[next_state])
    return action_values
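
# Lookahead returns the one-step action values under the learned model:
#   Q(s, a) = sum_{s'} P_hat(s'|s, a) * [ R_hat(s, a, s') + gamma * V(s') ]
# Both policy iteration and value iteration below pick argmax_a of this vector.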

def policy_iteration(environment, policy, discount_factor=1.0, max_iterations=50):
    iters = []
    evaluation = []
    evaluate_iter = 0
    flag = 0
    for i in range(max_iterations):
        #print(f'Policy in {i} iter. is {policy}')
        stable_policy = True
        evaluate_iter += 1
        V = Policy_Evaluation(environment, policy, discount_factor=discount_factor)
        #go through each state and try to improve the action taken
        for state in range(environment.nS):
            curr_action = policy[state]
            #evaluate every action from this state
            action_values = Lookahead(environment, state, V, discount_factor)
            #pick the best action
            best_action = np.argmax(action_values)
            #greedy update
            policy[state] = best_action
            if (best_action != curr_action):
                stable_policy = False  #mark the policy unstable if any state's action changes
        eval = TestPolicy(environment, policy, trials=100)

        iters.append(i)
        evaluation.append(eval[1])
        if (stable_policy and flag == 0):
            print(f'Policy converged in {evaluate_iter} iterations.')
            flag = 1
            #plt.plot(iters,evaluation, color='b')
            #plt.show()
            #return policy

    plt.plot(iters, evaluation, color='b')
    plt.xlabel("iterations")
    plt.ylabel("test_policy")
    plt.show()
    return policy
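
# Policy iteration alternates full policy evaluation with a greedy improvement
# sweep; once no state's action changes in a sweep (stable_policy stays True),
# the policy is optimal for the learned model, which is what the convergence
# message above reports. The remaining iterations only re-measure the policy.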

policy_ques = np.zeros(env.nS)
states = np.arange(0, 16)
policy_ques[states] = np.mod(states + 1, 4)
Policy = policy_iteration(env, policy_ques)
print(Policy)

#start with random policies
policy_rand1 = np.random.randint(4, size=env.nS)
Policy = policy_iteration(env, policy_rand1)
print(Policy)

policy_rand2 = np.random.randint(4, size=env.nS)
Policy = policy_iteration(env, policy_rand2)
print(Policy)

def Value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=50):
    V = np.zeros(environment.nS)
    policy = np.zeros(environment.nS)
    evaluate_iter = 0
    iters = []
    evaluation = []
    for i in range(max_iterations):
        evaluate_iter += 1
        delta = 0
        for state in range(environment.nS):
            action_value = Lookahead(environment, state, V, discount_factor)
            best_action_value = np.max(action_value)
            best_action = np.argmax(action_value)
            delta = max(delta, np.abs(V[state] - best_action_value))
            V[state] = best_action_value
            policy[state] = best_action
        eval = TestPolicy(environment, policy, trials=100)
        iters.append(i)
        evaluation.append(eval[1])

        if (delta < theta):
            print(f'Value converged in {evaluate_iter} iterations.')
            plt.plot(iters, evaluation, color='b')
            plt.show()
            return policy

    plt.plot(iters, evaluation, color='b')
    plt.xlabel("iterations")
    plt.ylabel("test_policy")
    plt.show()
    return policy


policy = Value_iteration(env)
print(policy)
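
# Value_iteration above applies the Bellman optimality backup
#   V(s) <- max_a Q(s, a)
# with Q(s, a) computed by Lookahead, and reads the greedy policy off the same
# sweep, so no separate policy evaluation step is needed.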

def choose_action(state, epsilon, Q):
    action = 0
    if np.random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state, :])
    return action
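
# choose_action is a standard epsilon-greedy rule: with probability epsilon it
# samples a random action (explore), otherwise it takes argmax_a Q[state, a]
# (exploit). For example, choose_action(s, 1.0, Q) always explores and
# choose_action(s, 0.0, Q) always exploits.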


def Q_learning(environment, gamma=0.99, alpha=0.05, total_episodes=5000, max_steps=50):
    Q = np.zeros((environment.nS, environment.nA))
    policy = np.zeros(environment.nS)
    episodes = []
    evaluations = []
    for episode in range(total_episodes):
        #print(episode)
        state = environment.reset()
        t = 0

        while t < max_steps:
            #epsilon decays linearly from 1 to 0 over the training episodes
            action = choose_action(state, 1 - episode / total_episodes, Q)
            next_state, reward, done, info = environment.step(action)
            predict = Q[state, action]
            target = reward + gamma * np.max(Q[next_state, :])
            Q[state, action] = Q[state, action] + alpha * (target - predict)
            state = next_state

            #start a new episode once this one terminates
            if done:
                #determine the greedy policy so far
                policy = np.argmax(Q, axis=1)
                evaluation = TestPolicy(environment, policy, trials=100)
                break

            t += 1

        policy = np.argmax(Q, axis=1)
        if (episode % 100 == 0):
            evaluation = TestPolicy(environment, policy, trials=100)
            episodes.append(episode)
            evaluations.append(evaluation[1])

    plt.plot(episodes, evaluations, color='b')
    title = "model: alpha = " + str(alpha) + ", gamma = " + str(gamma)
    plt.title(title)
    plt.xlabel("episodes")
    plt.ylabel("test_policy")
    plt.show()
    return Q
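
# The update inside the loop above is the tabular Q-learning (TD) rule:
#   Q(s, a) <- Q(s, a) + alpha * [ r + gamma * max_a' Q(s', a') - Q(s, a) ]
# where "predict" is the current estimate Q(s, a) and "target" is the
# bootstrapped return r + gamma * max_a' Q(s', a').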

gamma_list = [0.90, 0.95, 0.99]
alpha_list = [0.05, 0.1, 0.25, 0.5]

for i in range(0, len(alpha_list)):
    Qans = Q_learning(env, gamma=0.99, alpha=alpha_list[i])
    policy = np.argmax(Qans, axis=1)
    print(f'The policy for alpha value {alpha_list[i]} and gamma value 0.99 is {policy}.')


for i in range(0, len(gamma_list)):
    Qans = Q_learning(env, gamma=gamma_list[i], alpha=0.05)
    policy = np.argmax(Qans, axis=1)
    print(f'The policy for alpha value 0.05 and gamma value {gamma_list[i]} is {policy}.')

def Q_learning_opt(environment, gamma=0.99, alpha=0.05, explore=1, total_episodes=5000, max_steps=50):
    """Same as Q_learning, but with a fixed exploration rate `explore` instead of a decaying one."""
    Q = np.zeros((environment.nS, environment.nA))
    policy = np.zeros(environment.nS)
    episodes = []
    evaluations = []
    for episode in range(total_episodes):
        #print(episode)
        state = environment.reset()
        t = 0

        while t < max_steps:
            action = choose_action(state, explore, Q)
            next_state, reward, done, info = environment.step(action)
            predict = Q[state, action]
            target = reward + gamma * np.max(Q[next_state, :])
            Q[state, action] = Q[state, action] + alpha * (target - predict)
            #print(Q)
            state = next_state

            #start a new episode once this one terminates
            if done:
                #determine the greedy policy so far
                policy = np.argmax(Q, axis=1)
                evaluation = TestPolicy(environment, policy, trials=100)
                break

            #time.sleep(0.1)
            t += 1
            #print(f'Value t is {t}')
        policy = np.argmax(Q, axis=1)
        if (episode % 100 == 0):
            evaluation = TestPolicy(environment, policy, trials=100)
            episodes.append(episode)
            evaluations.append(evaluation[1])

    plt.plot(episodes, evaluations, color='b')
    title = "model: alpha = " + str(alpha) + ", gamma = " + str(gamma) + ", explore = " + str(explore)
    plt.title(title)
    plt.xlabel("episodes")
    plt.ylabel("test_policy")
    plt.show()
    return Q


Qans = Q_learning_opt(env, gamma=0.99, alpha=0.05)
policy = np.argmax(Qans, axis=1)
print(f'The policy for alpha 0.05, gamma 0.99 and explore 1 is {policy}.')

Qans = Q_learning_opt(env, gamma=0.99, alpha=0.05, explore=0.9)
policy = np.argmax(Qans, axis=1)
print(f'The policy for alpha 0.05, gamma 0.99 and explore 0.9 is {policy}.')

Qans = Q_learning_opt(env, gamma=0.99, alpha=0.05, explore=0.5)
policy = np.argmax(Qans, axis=1)
print(f'The policy for alpha 0.05, gamma 0.99 and explore 0.5 is {policy}.')
