import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
import itertools
import argparse
import os
import pickle

from sklearn.preprocessing import StandardScaler


# Let's use AAPL (Apple), MSI (Motorola), SBUX (Starbucks)
def get_data():
  # returns a T x 3 array of daily close prices
  # each row is a day, each column is a stock:
  # 0 = AAPL
  # 1 = MSI
  # 2 = SBUX
  df = pd.read_csv('../tf2.0/aapl_msi_sbux.csv')
  return df.values


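# Usage sketch (illustrative; assumes the CSV path above exists):
#
#   prices = get_data()
#   prices.shape   # (T, 3): one row per trading day, one column per ticker
#   prices[0]      # day-0 close prices for AAPL, MSI, SBUX

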
def get_scaler(env):
  # return a scikit-learn StandardScaler fit on states visited
  # while taking random actions in env
  # Note: you could also populate the replay buffer here

  states = []
  for _ in range(env.n_step):
    action = np.random.choice(env.action_space)
    state, reward, done, info = env.step(action)
    states.append(state)
    if done:
      break

  scaler = StandardScaler()
  scaler.fit(states)
  return scaler


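# Usage sketch (illustrative): the scaler is fit once on states gathered under
# a random policy, and every state is then transformed before it reaches the
# model, e.g.
#
#   env = MultiStockEnv(get_data())
#   scaler = get_scaler(env)
#   state = scaler.transform([env.reset()])   # shape (1, state_dim)

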
def maybe_make_dir(directory):
  if not os.path.exists(directory):
    os.makedirs(directory)


class LinearModel:
  """ A linear regression model trained with momentum SGD """
  def __init__(self, input_dim, n_action):
    self.W = np.random.randn(input_dim, n_action) / np.sqrt(input_dim)
    self.b = np.zeros(n_action)

    # momentum terms
    self.vW = 0
    self.vb = 0

    self.losses = []

  def predict(self, X):
    # make sure X is N x D
    assert len(X.shape) == 2
    return X.dot(self.W) + self.b

  def sgd(self, X, Y, learning_rate=0.01, momentum=0.9):
    # make sure X is N x D
    assert len(X.shape) == 2

    # the targets Y are 2-D (N x K)
    # normally we would divide by N only
    # but now we divide by N x K
    num_values = np.prod(Y.shape)

    # do one step of gradient descent
    # we multiply by 2 to get the exact gradient of the squared error
    # (rather than folding the 2 into the learning rate)
    # i.e. d/dx (x^2) --> 2x
    Yhat = self.predict(X)
    gW = 2 * X.T.dot(Yhat - Y) / num_values
    gb = 2 * (Yhat - Y).sum(axis=0) / num_values

    # update momentum terms
    self.vW = momentum * self.vW - learning_rate * gW
    self.vb = momentum * self.vb - learning_rate * gb

    # update params
    self.W += self.vW
    self.b += self.vb

    mse = np.mean((Yhat - Y)**2)
    self.losses.append(mse)

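  # In symbols (a restatement of sgd above, for reference): with Yhat = X W + b,
  # N samples and K = n_action outputs,
  #   L     = (1 / (N K)) * sum_{n,k} (Yhat_{nk} - Y_{nk})^2
  #   dL/dW = (2 / (N K)) * X^T (Yhat - Y)
  #   dL/db = (2 / (N K)) * column sums of (Yhat - Y)
  # followed by the classical momentum update v <- momentum * v - lr * grad,
  # param <- param + v.
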
  def load_weights(self, filepath):
    npz = np.load(filepath)
    self.W = npz['W']
    self.b = npz['b']

  def save_weights(self, filepath):
    np.savez(filepath, W=self.W, b=self.b)


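# Quick sanity check for LinearModel (illustrative sketch; this helper is never
# called by the script): fit a random linear target and confirm the
# momentum-SGD loss decreases.
def _linear_model_sanity_check():
  np.random.seed(0)
  X = np.random.randn(100, 4)
  true_W = np.random.randn(4, 2)
  Y = X.dot(true_W)
  model = LinearModel(4, 2)
  for _ in range(200):
    model.sgd(X, Y, learning_rate=0.01, momentum=0.9)
  assert model.losses[-1] < model.losses[0]

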
class MultiStockEnv:
  """
  A 3-stock trading environment.
  State: vector of size 7 (n_stock * 2 + 1)
    - # shares of stock 1 owned
    - # shares of stock 2 owned
    - # shares of stock 3 owned
    - price of stock 1 (using daily close price)
    - price of stock 2
    - price of stock 3
    - cash in hand (can be used to purchase more stocks)
  Action: categorical variable with 27 (3^3) possibilities
    - for each stock, you can:
      - 0 = sell
      - 1 = hold
      - 2 = buy
  """
  def __init__(self, data, initial_investment=20000):
    # data
    self.stock_price_history = data
    self.n_step, self.n_stock = self.stock_price_history.shape

    # instance attributes
    self.initial_investment = initial_investment
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    self.action_space = np.arange(3**self.n_stock)

    # action permutations
    # returns a nested list with elements like:
    # [0,0,0]
    # [0,0,1]
    # [0,0,2]
    # [0,1,0]
    # [0,1,1]
    # etc.
    # 0 = sell
    # 1 = hold
    # 2 = buy
    # (see the illustrative mapping sketch after this class)
    self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

    # calculate size of state
    self.state_dim = self.n_stock * 2 + 1

    self.reset()

  def reset(self):
    self.cur_step = 0
    self.stock_owned = np.zeros(self.n_stock)
    self.stock_price = self.stock_price_history[self.cur_step]
    self.cash_in_hand = self.initial_investment
    return self._get_obs()

  def step(self, action):
    assert action in self.action_space

    # get current portfolio value before performing the action
    prev_val = self._get_val()

    # update price, i.e. go to the next day
    self.cur_step += 1
    self.stock_price = self.stock_price_history[self.cur_step]

    # perform the trade
    self._trade(action)

    # get the new portfolio value after taking the action
    cur_val = self._get_val()

    # reward is the increase in portfolio value
    reward = cur_val - prev_val

    # done if we have run out of data
    done = self.cur_step == self.n_step - 1

    # store the current value of the portfolio here
    info = {'cur_val': cur_val}

    # conform to the Gym API
    return self._get_obs(), reward, done, info

  def _get_obs(self):
    obs = np.empty(self.state_dim)
    obs[:self.n_stock] = self.stock_owned
    obs[self.n_stock:2*self.n_stock] = self.stock_price
    obs[-1] = self.cash_in_hand
    return obs

  def _get_val(self):
    return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

  def _trade(self, action):
    # look up the per-stock action vector for this action index
    # 0 = sell
    # 1 = hold
    # 2 = buy
    # e.g. [2,1,0] means:
    #   buy first stock
    #   hold second stock
    #   sell third stock
    action_vec = self.action_list[action]

    # determine which stocks to buy or sell
    sell_index = []  # stores index of stocks we want to sell
    buy_index = []   # stores index of stocks we want to buy
    for i, a in enumerate(action_vec):
      if a == 0:
        sell_index.append(i)
      elif a == 2:
        buy_index.append(i)

    # sell any stocks we want to sell
    # then buy any stocks we want to buy
    if sell_index:
      # NOTE: to simplify the problem, when we sell, we sell ALL shares of that stock
      for i in sell_index:
        self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
        self.stock_owned[i] = 0
    if buy_index:
      # NOTE: when buying, we loop through each stock we want to buy
      # and buy one share at a time (round-robin) until we run out of cash
      can_buy = True
      while can_buy:
        for i in buy_index:
          if self.cash_in_hand > self.stock_price[i]:
            self.stock_owned[i] += 1  # buy one share
            self.cash_in_hand -= self.stock_price[i]
          else:
            can_buy = False


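# Illustrative sketch (this helper is never called by the script, and it
# assumes the CSV used by get_data() is available): shows how an action index
# maps to a per-stock [sell/hold/buy] vector and what one env step returns.
def _example_env_mechanics():
  env = MultiStockEnv(get_data(), initial_investment=20000)

  assert env.action_list[0] == [0, 0, 0]    # sell all three stocks
  assert env.action_list[13] == [1, 1, 1]   # hold all three stocks
  assert env.action_list[26] == [2, 2, 2]   # buy all three stocks
  assert env.action_list[18] == [2, 0, 0]   # buy AAPL, sell MSI, sell SBUX

  state = env.reset()                            # [0, 0, 0, p1, p2, p3, 20000]
  next_state, reward, done, info = env.step(26)  # spend all cash, round-robin
  # reward is the one-day change in portfolio value; info['cur_val'] is its level
  return next_state, reward, done, info

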
class DQNAgent(object):
  def __init__(self, state_size, action_size):
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = 0.95  # discount rate
    self.epsilon = 1.0  # exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.model = LinearModel(state_size, action_size)

  def act(self, state):
    # epsilon-greedy: explore with probability epsilon, otherwise act greedily
    if np.random.rand() <= self.epsilon:
      return np.random.choice(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0])  # returns action

  def train(self, state, action, reward, next_state, done):
    # Q-learning target: r if terminal, else r + gamma * max_a' Q(s', a')
    if done:
      target = reward
    else:
      target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])

    # only the output for the chosen action is pushed toward the target;
    # the other outputs keep the model's own predictions
    target_full = self.model.predict(state)
    target_full[0, action] = target

    # run one training step
    self.model.sgd(state, target_full)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    self.model.load_weights(name)

  def save(self, name):
    self.model.save_weights(name)


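# Worked example of the Q-learning target above (illustrative numbers only;
# this helper is never called by the script).
def _example_q_target():
  gamma = 0.95
  reward = 10.0
  next_q = np.array([[1.0, 3.0, 2.0]])          # stand-in for model.predict(next_state)
  target = reward + gamma * np.amax(next_q[0])  # 10 + 0.95 * 3 = 12.85
  assert abs(target - 12.85) < 1e-9
  return target

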
def play_one_episode(agent, env, mode):
  # note: after scaling, states are already 1 x D
  # (scaler is the module-level StandardScaler created in the main block below)
  state = env.reset()
  state = scaler.transform([state])
  done = False

  while not done:
    action = agent.act(state)
    next_state, reward, done, info = env.step(action)
    next_state = scaler.transform([next_state])
    if mode == 'train':
      agent.train(state, action, reward, next_state, done)
    state = next_state

  return info['cur_val']


if __name__ == '__main__':

  # config
  models_folder = 'linear_rl_trader_models'
  rewards_folder = 'linear_rl_trader_rewards'
  num_episodes = 2000
  batch_size = 32  # not used by the linear agent
  initial_investment = 20000

  parser = argparse.ArgumentParser()
  parser.add_argument('-m', '--mode', type=str, required=True,
                      help='either "train" or "test"')
  args = parser.parse_args()

  maybe_make_dir(models_folder)
  maybe_make_dir(rewards_folder)

  data = get_data()
  n_timesteps, n_stocks = data.shape

  # use the first half of the data for training, the second half for testing
  n_train = n_timesteps // 2
  train_data = data[:n_train]
  test_data = data[n_train:]

  env = MultiStockEnv(train_data, initial_investment)
  state_size = env.state_dim
  action_size = len(env.action_space)
  agent = DQNAgent(state_size, action_size)
  scaler = get_scaler(env)

  # store the final value of the portfolio (end of episode)
  portfolio_value = []

  if args.mode == 'test':
    # then load the previous scaler
    with open(f'{models_folder}/scaler.pkl', 'rb') as f:
      scaler = pickle.load(f)

    # remake the env with test data
    env = MultiStockEnv(test_data, initial_investment)

    # make sure epsilon is not 1!
    # no need to run multiple episodes if epsilon = 0, it's deterministic
    agent.epsilon = 0.01

    # load trained weights
    agent.load(f'{models_folder}/linear.npz')

  # play the game num_episodes times
  for e in range(num_episodes):
    t0 = datetime.now()
    val = play_one_episode(agent, env, args.mode)
    dt = datetime.now() - t0
    print(f"episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
    portfolio_value.append(val)  # append episode end portfolio value

  # save the weights when we are done
  if args.mode == 'train':
    # save the DQN
    agent.save(f'{models_folder}/linear.npz')

    # save the scaler
    with open(f'{models_folder}/scaler.pkl', 'wb') as f:
      pickle.dump(scaler, f)

    # plot losses
    plt.plot(agent.model.losses)
    plt.show()

  # save portfolio value for each episode
  np.save(f'{rewards_folder}/{args.mode}.npy', portfolio_value)
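
# Usage sketch (assumed script filename; adjust to the actual file name):
#   python linear_rl_trader.py -m train   # train, save weights + scaler, plot losses
#   python linear_rl_trader.py -m test    # evaluate on the held-out second half of the data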