
Commit 0e97857

update

1 parent 3231880 commit 0e97857

File tree

3 files changed: +404 -0 lines changed

rl/extra_reading.txt

Lines changed: 3 additions & 0 deletions

@@ -1,6 +1,9 @@
 Hacking Google reCAPTCHA v3 using Reinforcement Learning
 https://arxiv.org/pdf/1903.01003.pdf
 
+Practical Deep Reinforcement Learning Approach for Stock Trading
+https://arxiv.org/abs/1811.07522
+
 Reinforcement Learning: A Tutorial Survey and Recent Advances - Abhijit Gosavi
 http://web.mst.edu/~gosavia/joc.pdf
 
rl/linear_rl_trader.py

Lines changed: 385 additions & 0 deletions

@@ -0,0 +1,385 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
import itertools
import argparse
import re
import os
import pickle

from sklearn.preprocessing import StandardScaler


# Let's use AAPL (Apple), MSI (Motorola), SBUX (Starbucks)
def get_data():
  # returns a T x 3 array of stock prices
  # each row is a different day, each column is a different stock
  # 0 = AAPL
  # 1 = MSI
  # 2 = SBUX
  df = pd.read_csv('../tf2.0/aapl_msi_sbux.csv')
  return df.values


def get_scaler(env):
  # return a scikit-learn scaler object to scale the states
  # Note: you could also populate the replay buffer here

  states = []
  for _ in range(env.n_step):
    action = np.random.choice(env.action_space)
    state, reward, done, info = env.step(action)
    states.append(state)
    if done:
      break

  scaler = StandardScaler()
  scaler.fit(states)
  return scaler


def maybe_make_dir(directory):
  if not os.path.exists(directory):
    os.makedirs(directory)

class LinearModel:
  """ A linear regression model """
  def __init__(self, input_dim, n_action):
    self.W = np.random.randn(input_dim, n_action) / np.sqrt(input_dim)
    self.b = np.zeros(n_action)

    # momentum terms
    self.vW = 0
    self.vb = 0

    self.losses = []

  def predict(self, X):
    # make sure X is N x D
    assert(len(X.shape) == 2)
    return X.dot(self.W) + self.b

  def sgd(self, X, Y, learning_rate=0.01, momentum=0.9):
    # make sure X is N x D
    assert(len(X.shape) == 2)

    # the loss values are 2-D
    # normally we would divide by N only
    # but now we divide by N x K
    num_values = np.prod(Y.shape)

    # do one step of gradient descent
    # we multiply by 2 to get the exact gradient
    # (not adjusting the learning rate)
    # i.e. d/dx (x^2) --> 2x
    Yhat = self.predict(X)
    gW = 2 * X.T.dot(Yhat - Y) / num_values
    gb = 2 * (Yhat - Y).sum(axis=0) / num_values

    # update momentum terms
    self.vW = momentum * self.vW - learning_rate * gW
    self.vb = momentum * self.vb - learning_rate * gb

    # update params
    self.W += self.vW
    self.b += self.vb

    mse = np.mean((Yhat - Y)**2)
    self.losses.append(mse)

  def load_weights(self, filepath):
    npz = np.load(filepath)
    self.W = npz['W']
    self.b = npz['b']

  def save_weights(self, filepath):
    np.savez(filepath, W=self.W, b=self.b)

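A quick way to sanity-check the momentum SGD update above (an illustrative sketch, not part of the commit; the synthetic data, seed, and step count are arbitrary):

# Illustrative sketch only -- exercises LinearModel on synthetic data.
np.random.seed(0)
X = np.random.randn(100, 7)                    # 100 states of dimension 7
true_W = np.random.randn(7, 27)
Y = X.dot(true_W)                              # targets from a known linear map
model = LinearModel(input_dim=7, n_action=27)
for _ in range(200):
  model.sgd(X, Y)                              # one full-batch momentum step
print(model.losses[0], model.losses[-1])       # loss should shrink substantially
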
class MultiStockEnv:
  """
  A 3-stock trading environment.
  State: vector of size 7 (n_stock * 2 + 1)
    - # shares of stock 1 owned
    - # shares of stock 2 owned
    - # shares of stock 3 owned
    - price of stock 1 (using daily close price)
    - price of stock 2
    - price of stock 3
    - cash owned (can be used to purchase more stocks)
  Action: categorical variable with 27 (3^3) possibilities
    - for each stock, you can:
      - 0 = sell
      - 1 = hold
      - 2 = buy
  """
  def __init__(self, data, initial_investment=20000):
    # data
    self.stock_price_history = data
    self.n_step, self.n_stock = self.stock_price_history.shape

    # instance attributes
    self.initial_investment = initial_investment
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    self.action_space = np.arange(3**self.n_stock)

    # action permutations
    # returns a nested list with elements like:
    # [0,0,0]
    # [0,0,1]
    # [0,0,2]
    # [0,1,0]
    # [0,1,1]
    # etc.
    # 0 = sell
    # 1 = hold
    # 2 = buy
    self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

    # calculate size of state
    self.state_dim = self.n_stock * 2 + 1

    self.reset()

  def reset(self):
    self.cur_step = 0
    self.stock_owned = np.zeros(self.n_stock)
    self.stock_price = self.stock_price_history[self.cur_step]
    self.cash_in_hand = self.initial_investment
    return self._get_obs()

  def step(self, action):
    assert action in self.action_space

    # get current value before performing the action
    prev_val = self._get_val()

    # update price, i.e. go to the next day
    self.cur_step += 1
    self.stock_price = self.stock_price_history[self.cur_step]

    # perform the trade
    self._trade(action)

    # get the new value after taking the action
    cur_val = self._get_val()

    # reward is the increase in portfolio value
    reward = cur_val - prev_val

    # done if we have run out of data
    done = self.cur_step == self.n_step - 1

    # store the current value of the portfolio here
    info = {'cur_val': cur_val}

    # conform to the Gym API
    return self._get_obs(), reward, done, info

  def _get_obs(self):
    obs = np.empty(self.state_dim)
    obs[:self.n_stock] = self.stock_owned
    obs[self.n_stock:2*self.n_stock] = self.stock_price
    obs[-1] = self.cash_in_hand
    return obs

  def _get_val(self):
    return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

  def _trade(self, action):
    # index the action we want to perform
    # 0 = sell
    # 1 = hold
    # 2 = buy
    # e.g. [2,1,0] means:
    #   buy first stock
    #   hold second stock
    #   sell third stock
    action_vec = self.action_list[action]

    # determine which stocks to buy or sell
    sell_index = [] # stores index of stocks we want to sell
    buy_index = [] # stores index of stocks we want to buy
    for i, a in enumerate(action_vec):
      if a == 0:
        sell_index.append(i)
      elif a == 2:
        buy_index.append(i)

    # sell any stocks we want to sell
    # then buy any stocks we want to buy
    if sell_index:
      # NOTE: to simplify the problem, when we sell, we will sell ALL shares of that stock
      for i in sell_index:
        self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
        self.stock_owned[i] = 0
    if buy_index:
      # NOTE: when buying, we will loop through each stock we want to buy,
      # and buy one share at a time until we run out of cash
      can_buy = True
      while can_buy:
        for i in buy_index:
          if self.cash_in_hand > self.stock_price[i]:
            self.stock_owned[i] += 1 # buy one share
            self.cash_in_hand -= self.stock_price[i]
          else:
            can_buy = False

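To make the flat action encoding concrete: the index into action_list is effectively a base-3 number with one digit per stock. A small decoding check (illustrative sketch, not part of the commit):

# Illustrative sketch only -- decode a flat action index into per-stock moves.
action_list = list(map(list, itertools.product([0, 1, 2], repeat=3)))
print(len(action_list))   # 27 == 3**3
print(action_list[18])    # [2, 0, 0]: buy AAPL, sell MSI, sell SBUX
# equivalently, 18 in base 3 is 200, one digit per stock
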
class DQNAgent(object):
  def __init__(self, state_size, action_size):
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = 0.95 # discount rate
    self.epsilon = 1.0 # exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.model = LinearModel(state_size, action_size)

  def act(self, state):
    if np.random.rand() <= self.epsilon:
      return np.random.choice(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0]) # returns action

  def train(self, state, action, reward, next_state, done):
    if done:
      target = reward
    else:
      target = reward + self.gamma * np.amax(self.model.predict(next_state), axis=1)

    target_full = self.model.predict(state)
    target_full[0, action] = target

    # run one training step
    self.model.sgd(state, target_full)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    self.model.load_weights(name)

  def save(self, name):
    self.model.save_weights(name)

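One detail worth calling out in train(): the TD target is written into a copy of the current predictions, so the squared error (and therefore the gradient) is nonzero only for the action actually taken. A minimal illustration of that masking trick (a sketch, not part of the commit; the state, action index, and target value are arbitrary):

# Illustrative sketch only -- the masking trick used in DQNAgent.train.
model = LinearModel(input_dim=7, n_action=27)
state = np.random.randn(1, 7)         # a single 1 x D state
target_full = model.predict(state)    # shape (1, 27): current Q estimates
target_full[0, 5] = 1.23              # overwrite only the taken action's entry
model.sgd(state, target_full)         # error is zero everywhere except index 5
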
def play_one_episode(agent, env, is_train):
  # note: after transforming, states are already 1 x D
  # (uses the global `scaler` defined in the main block below)
  state = env.reset()
  state = scaler.transform([state])
  done = False

  while not done:
    action = agent.act(state)
    next_state, reward, done, info = env.step(action)
    next_state = scaler.transform([next_state])
    if is_train == 'train':
      agent.train(state, action, reward, next_state, done)
    state = next_state

  return info['cur_val']

if __name__ == '__main__':

  # config
  models_folder = 'linear_rl_trader_models'
  rewards_folder = 'linear_rl_trader_rewards'
  num_episodes = 2000
  batch_size = 32 # not used in this script
  initial_investment = 20000

  parser = argparse.ArgumentParser()
  parser.add_argument('-m', '--mode', type=str, required=True,
                      help='either "train" or "test"')
  args = parser.parse_args()

  maybe_make_dir(models_folder)
  maybe_make_dir(rewards_folder)

  data = get_data()
  n_timesteps, n_stocks = data.shape

  n_train = n_timesteps // 2

  train_data = data[:n_train]
  test_data = data[n_train:]

  env = MultiStockEnv(train_data, initial_investment)
  state_size = env.state_dim
  action_size = len(env.action_space)
  agent = DQNAgent(state_size, action_size)
  scaler = get_scaler(env)

  # store the final value of the portfolio (end of episode)
  portfolio_value = []

  if args.mode == 'test':
    # then load the previous scaler
    with open(f'{models_folder}/scaler.pkl', 'rb') as f:
      scaler = pickle.load(f)

    # remake the env with test data
    env = MultiStockEnv(test_data, initial_investment)

    # make sure epsilon is not 1!
    # no need to run multiple episodes if epsilon = 0, it's deterministic
    agent.epsilon = 0.01

    # load trained weights
    agent.load(f'{models_folder}/linear.npz')

  # play the game num_episodes times
  for e in range(num_episodes):
    t0 = datetime.now()
    val = play_one_episode(agent, env, args.mode)
    dt = datetime.now() - t0
    print(f"episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
    portfolio_value.append(val) # append episode end portfolio value

  # save the weights when we are done
  if args.mode == 'train':
    # save the DQN
    agent.save(f'{models_folder}/linear.npz')

    # save the scaler
    with open(f'{models_folder}/scaler.pkl', 'wb') as f:
      pickle.dump(scaler, f)

    # plot losses
    plt.plot(agent.model.losses)
    plt.show()

  # save portfolio value for each episode
  np.save(f'{rewards_folder}/{args.mode}.npy', portfolio_value)
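
The script is driven entirely by the --mode flag. The snippet below (an illustrative sketch, not part of the commit) shows a typical train/test run and one way to inspect the saved per-episode portfolio values:

# Illustrative sketch only.
# Run from the rl/ directory (assumes ../tf2.0/aapl_msi_sbux.csv exists):
#   python linear_rl_trader.py -m train
#   python linear_rl_trader.py -m test
import numpy as np
import matplotlib.pyplot as plt

vals = np.load('linear_rl_trader_rewards/train.npy')
print(f"episodes: {len(vals)}, mean: {vals.mean():.2f}, "
      f"min: {vals.min():.2f}, max: {vals.max():.2f}")
plt.hist(vals, bins=20)
plt.title('end-of-episode portfolio value (train)')
plt.show()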
