maxpumperla
diff --git a/‎code/dlgo/agent/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎code/dlgo/agent/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎code/dlgo/agent/alphago.py‎
Lines changed: 168 additions & 0 deletions b/‎code/dlgo/agent/alphago.py‎
Lines changed: 168 additions & 0 deletions
diff --git a/‎code/dlgo/agent/alphago_test.py‎
Lines changed: 86 additions & 0 deletions b/‎code/dlgo/agent/alphago_test.py‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎code/dlgo/agent/base.py‎
Lines changed: 4 additions & 2 deletions b/‎code/dlgo/agent/base.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎code/dlgo/agent/naive_fast.py‎
Lines changed: 1 addition & 0 deletions b/‎code/dlgo/agent/naive_fast.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎code/dlgo/agent/pg.py‎
Lines changed: 8 additions & 2 deletions b/‎code/dlgo/agent/pg.py‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎code/dlgo/agent/predict.py‎
Lines changed: 16 additions & 12 deletions b/‎code/dlgo/agent/predict.py‎
Lines changed: 16 additions & 12 deletions
@@ -1,5 +1,7 @@
+from .alphago import *
 from .base import *
 from .pg import *
 from .predict import *
 from .naive import *
 from .naive_fast import *
+from .termination import *
@@ -0,0 +1,168 @@
+# tag::alphago_imports[]
+import numpy as np
+from dlgo.agent.base import Agent
+from dlgo.goboard_fast import Move
+from dlgo import kerasutil
+import operator
+# end::alphago_imports[]
+
+
+__all__ = [
+    'AlphaGoNode',
+    'AlphaGoMCTS'
+]
+
+
+# tag::init_alphago_node[]
+class AlphaGoNode:
+    """A node of the AlphaGo search tree.
+
+    Every node keeps its visit count, its accumulated Q-value, the prior
+    probability it was created with, and a utility (u-value) that starts
+    out equal to the prior. Children are stored in a dict keyed by the
+    move that leads to them.
+    """
+
+    def __init__(self, parent=None, probability=1.0):
+        self.parent = parent  # <1>
+        self.children = {}  # <1>
+
+        self.visit_count = 0
+        self.q_value = 0
+        self.prior_value = probability  # <2>
+        self.u_value = probability  # <3>
+# <1> Tree nodes have one parent and potentially many children.
+# <2> A node is initialized with a prior probability.
+# <3> The utility function will be updated during search.
+# end::init_alphago_node[]
+
+# tag::select_node[]
+    def select_child(self):
+        """Return the ``(move, node)`` pair with the largest Q + u.
+
+        Raises ``ValueError`` when the node has no children.
+        """
+        return max(self.children.items(),
+                   key=lambda child: child[1].q_value + child[1].u_value)
+# end::select_node[]
+
+# tag::expand_children[]
+    def expand_children(self, moves, probabilities):
+        """Create a child for every move that does not have one yet.
+
+        ``moves`` and ``probabilities`` are consumed pairwise; each
+        probability becomes the prior of the child created for its move.
+        Moves that already have a child are left untouched.
+        """
+        for move, prob in zip(moves, probabilities):
+            if move not in self.children:
+                # Link the child back to this node. update_values() walks
+                # the parent chain, so an unlinked child would never pass
+                # its visits up the tree nor refresh its own u_value.
+                self.children[move] = AlphaGoNode(
+                    parent=self, probability=prob)
+# end::expand_children[]
+
+# tag::update_values[]
+    def update_values(self, leaf_value):
+        """Record one visit worth ``leaf_value`` on this node and above.
+
+        Ancestors are updated first, so the parent's visit count already
+        includes this visit when the u-value is recomputed below.
+        """
+        if self.parent is not None:
+            self.parent.update_values(leaf_value)  # <1>
+
+        self.visit_count += 1  # <2>
+
+        # NOTE(review): this adds leaf_value / visit_count on every visit,
+        # which is not the running mean of the leaf values seen so far;
+        # behavior is kept as is -- confirm that this is intended.
+        self.q_value += leaf_value / self.visit_count  # <3>
+
+        if self.parent is not None:
+            c_u = 5
+            self.u_value = c_u * np.sqrt(self.parent.visit_count) \
+                * self.prior_value / (1 + self.visit_count)  # <4>
+
+# <1> We update parents first to ensure we traverse the tree top to bottom.
+# <2> Increment the visit count for this node.
+# <3> Add the specified leaf value to the Q-value, normalized by visit count.
+# <4> Update utility with current visit counts.
+# end::update_values[]
+
+
+# tag::alphago_mcts_init[]
+class AlphaGoMCTS(Agent):
+    """Tree-search agent built from two policy agents and a value agent.
+
+    The strong policy supplies prior probabilities when a node is expanded,
+    the fast policy plays rollouts from the position reached by the tree
+    descent, and the value agent scores that position directly. Both
+    estimates are mixed with ``lambda_value``.
+    """
+
+    def __init__(self, policy_agent, fast_policy_agent, value_agent,
+                 lambda_value=0.5, num_simulations=1000,
+                 depth=50, rollout_limit=100):
+        Agent.__init__(self)
+        self.policy = policy_agent
+        self.rollout_policy = fast_policy_agent
+        self.value = value_agent
+
+        self.lambda_value = lambda_value
+        self.num_simulations = num_simulations
+        self.depth = depth
+        self.rollout_limit = rollout_limit
+        self.root = AlphaGoNode()
+# end::alphago_mcts_init[]
+
+# tag::alphago_mcts_rollout[]
+    def select_move(self, game_state):
+        """Run ``num_simulations`` simulations and return the chosen move.
+
+        Each simulation descends at most ``depth`` plies from the root,
+        expanding nodes with the strong policy, then backs up a mix of the
+        value agent's prediction and a fast-policy rollout. The most
+        visited child of the root is returned; a pass is returned when the
+        root could not be expanded at all.
+        """
+        for _ in range(self.num_simulations):  # <1>
+            current_state = game_state
+            node = self.root
+            for _ in range(self.depth):  # <2>
+                if not node.children:  # <3>
+                    if current_state.is_over():
+                        break
+                    moves, probabilities = self.policy_probabilities(current_state)  # <4>
+                    node.expand_children(moves, probabilities)  # <4>
+                    if not node.children:
+                        # Nothing could be expanded (no board move is
+                        # legal); select_child() would raise on an empty
+                        # dict, so evaluate this node as the leaf.
+                        break
+
+                move, node = node.select_child()  # <5>
+                current_state = current_state.apply_move(move)  # <5>
+
+            value = self.value.predict(current_state)  # <6>
+            rollout = self.policy_rollout(current_state)  # <6>
+
+            weighted_value = (1 - self.lambda_value) * value + \
+                self.lambda_value * rollout  # <7>
+
+            node.update_values(weighted_value)  # <8>
+# <1> From current state play out a number of simulations
+# <2> Play moves until the specified depth is reached.
+# <3> If the current node doesn't have any children...
+# <4> ... expand them with probabilities from the strong policy.
+# <5> If there are children, we can select one and play the corresponding move.
+# <6> Compute output of value network and a rollout by the fast policy.
+# <7> Determine the combined value function.
+# <8> Update values for this node in the backup phase
+# end::alphago_mcts_rollout[]
+
+# tag::alphago_mcts_selection[]
+        if not self.root.children:
+            # The root was never expanded, so there is no board move to
+            # choose from; pass instead of calling max() on nothing.
+            self.root = AlphaGoNode()
+            return Move.pass_turn()
+
+        move = max(self.root.children, key=lambda move:  # <1>
+                   self.root.children.get(move).visit_count)  # <1>
+
+        self.root = AlphaGoNode()  # <2>
+
+        return move
+# <1> Pick most visited child of the root as next move.
+# <2> Start the next search from a fresh root: the tree built here is rooted at this call's game_state.
+# end::alphago_mcts_selection[]
+
+# tag::alphago_policy_probs[]
+    def policy_probabilities(self, game_state):
+        """Return the legal board moves and their normalized priors.
+
+        Only moves that place a stone are returned (moves whose ``point``
+        is ``None`` are left out), so both returned sequences have the
+        same length and order. The strong policy's outputs for those
+        points are rescaled to sum to one; if they sum to zero or less, a
+        uniform distribution is used instead. Returns ``([], [])`` when no
+        board move is legal.
+        """
+        encoder = self.policy._encoder
+        legal_moves = [move for move in game_state.legal_moves()
+                       if move.point is not None]
+        if not legal_moves:
+            return [], []
+        outputs = self.policy.predict(game_state)
+        encoded_points = [encoder.encode_point(move.point)
+                          for move in legal_moves]
+        legal_outputs = outputs[encoded_points]
+        total = np.sum(legal_outputs)
+        if total > 0:
+            normalized_outputs = legal_outputs / total
+        else:
+            # Avoid NaN priors when the network puts no mass on legal moves.
+            normalized_outputs = np.ones(len(legal_moves)) / len(legal_moves)
+        return legal_moves, normalized_outputs
+# end::alphago_policy_probs[]
+
+# tag::alphago_policy_rollout[]
+    def policy_rollout(self, game_state):
+        """Play greedily with the fast policy and score the final state.
+
+        For at most ``rollout_limit`` moves, the legal board move with the
+        highest fast-policy output is played; a pass is played when no
+        board move is legal. Returns 1 if the winner equals the final
+        state's ``next_player``, -1 if there is a different winner, and 0
+        if ``winner()`` returns ``None``.
+        """
+        encoder = self.rollout_policy.encoder
+        for _ in range(self.rollout_limit):
+            if game_state.is_over():
+                break
+            move_probabilities = self.rollout_policy.predict(game_state)
+            # Rank the legal moves themselves so that the probability
+            # looked up always belongs to the point that gets played.
+            play_moves = [move for move in game_state.legal_moves()
+                          if move.point is not None]
+            if play_moves:
+                greedy_move = max(
+                    play_moves,
+                    key=lambda move: move_probabilities[
+                        encoder.encode_point(move.point)])
+            else:
+                greedy_move = Move.pass_turn()
+            game_state = game_state.apply_move(greedy_move)
+
+        # NOTE(review): the result is relative to the player to move at
+        # the END of the rollout, not at its start -- confirm intended.
+        next_player = game_state.next_player
+        winner = game_state.winner()
+        if winner is not None:
+            return 1 if winner == next_player else -1
+        else:
+            return 0
+# end::alphago_policy_rollout[]
+
+    def serialize(self, h5file):
+        """Always raise ``IOError``: this agent cannot be serialized."""
+        raise IOError("AlphaGoMCTS agent can't be serialized; "
+                      "consider serializing the three underlying "
+                      "neural networks instead.")
@@ -0,0 +1,86 @@
+import unittest
+
+from dlgo.data.processor import GoDataProcessor
+from dlgo.agent.predict import DeepLearningAgent
+from dlgo.networks.alphago import alphago_model
+from dlgo.agent.pg import PolicyAgent
+from dlgo.agent.predict import load_prediction_agent
+from dlgo.encoders.alphago import AlphaGoEncoder
+from dlgo.rl.simulate import experience_simulation
+from dlgo.networks.alphago import alphago_model
+from dlgo.rl import ValueAgent, load_experience
+from dlgo.agent import load_prediction_agent, load_policy_agent, AlphaGoMCTS
+from dlgo.rl import load_value_agent
+from dlgo.goboard_fast import GameState
+
+from keras.callbacks import ModelCheckpoint
+import h5py
+import numpy as np
+
+class AlphaGoAgentTest(unittest.TestCase):
+    """Smoke test of the AlphaGo pipeline, from policy network to search.
+
+    The test names are numbered on purpose: each test reads HDF5 files
+    written by an earlier one, and unittest runs test methods sorted by
+    name. Running a single test in isolation requires those files to exist.
+    """
+
+    def test_1_supervised_learning(self):
+        # Builds the policy network, checks its output shape and saves it.
+        rows, cols = 19, 19
+        encoder = AlphaGoEncoder()
+
+        input_shape = (encoder.num_planes, rows, cols)
+        alphago_sl_policy = alphago_model(input_shape, is_policy_net=True)
+
+        alphago_sl_policy.compile('sgd', 'categorical_crossentropy', metrics=['accuracy'])
+
+        alphago_sl_agent = DeepLearningAgent(alphago_sl_policy, encoder)
+
+        inputs = np.ones((10,) + input_shape)
+        outputs = alphago_sl_policy.predict(inputs)
+        # assertEqual reports both shapes on failure and, unlike a bare
+        # assert, is not stripped when Python runs with -O.
+        self.assertEqual(outputs.shape, (10, 361))
+
+        with h5py.File('test_alphago_sl_policy.h5', 'w') as sl_agent_out:
+            alphago_sl_agent.serialize(sl_agent_out)
+
+    def test_2_reinforcement_learning(self):
+        # Plays one self-play game, trains on it, saves agent and experience.
+        encoder = AlphaGoEncoder()
+
+        # Open read-only and close the handle once both agents are loaded.
+        with h5py.File('test_alphago_sl_policy.h5', 'r') as sl_agent_in:
+            sl_agent = load_prediction_agent(sl_agent_in)
+            sl_opponent = load_prediction_agent(sl_agent_in)
+
+        alphago_rl_agent = PolicyAgent(sl_agent.model, encoder)
+        opponent = PolicyAgent(sl_opponent.model, encoder)
+
+        num_games = 1
+        experience = experience_simulation(num_games, alphago_rl_agent, opponent)
+
+        alphago_rl_agent.train(experience)
+
+        with h5py.File('test_alphago_rl_policy.h5', 'w') as rl_agent_out:
+            alphago_rl_agent.serialize(rl_agent_out)
+
+        with h5py.File('test_alphago_rl_experience.h5', 'w') as exp_out:
+            experience.serialize(exp_out)
+
+    def test_3_alphago_value(self):
+        # Trains a value agent on the saved experience and saves it.
+        rows, cols = 19, 19
+        encoder = AlphaGoEncoder()
+        input_shape = (encoder.num_planes, rows, cols)
+        alphago_value_network = alphago_model(input_shape)
+
+        alphago_value = ValueAgent(alphago_value_network, encoder)
+
+        with h5py.File('test_alphago_rl_experience.h5', 'r') as exp_in:
+            experience = load_experience(exp_in)
+
+        alphago_value.train(experience)
+
+        with h5py.File('test_alphago_value.h5', 'w') as value_agent_out:
+            alphago_value.serialize(value_agent_out)
+
+    def test_4_alphago_mcts(self):
+        # Loads the three saved agents and runs a small tree search.
+        with h5py.File('test_alphago_sl_policy.h5', 'r') as fast_in:
+            fast_policy = load_prediction_agent(fast_in)
+        with h5py.File('test_alphago_rl_policy.h5', 'r') as strong_in:
+            strong_policy = load_policy_agent(strong_in)
+        with h5py.File('test_alphago_value.h5', 'r') as value_in:
+            value = load_value_agent(value_in)
+
+        alphago = AlphaGoMCTS(strong_policy, fast_policy, value,
+                              num_simulations=20, depth=5, rollout_limit=10)
+        start = GameState.new_game(19)
+        move = alphago.select_move(start)
+        self.assertIsNotNone(move)
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -4,8 +4,10 @@
 
 
 # tag::agent[]
-class Agent():
-    """Interface for a go-playing bot."""
+class Agent:
+    def __init__(self):
+        pass
+
     def select_move(self, game_state):
         raise NotImplementedError()
 # end::agent[]
 
@@ -11,6 +11,7 @@
 
 class FastRandomBot(Agent):
     def __init__(self):
+        Agent.__init__(self)
         self.dim = None
         self.point_cache = []
 
 
@@ -32,11 +32,17 @@ def normalize(x):
 class PolicyAgent(Agent):
     """An agent that uses a deep policy network to select moves."""
     def __init__(self, model, encoder):
+        Agent.__init__(self)
         self._model = model
         self._encoder = encoder
         self._collector = None
         self._temperature = 0.0
 
+    def predict(self, game_state):
+        """Return the model's output for a single game state.
+
+        The state is encoded with this agent's encoder and wrapped in a
+        batch of one; the first (and only) row of the model's prediction
+        is returned.
+        """
+        batch = np.array([self._encoder.encode(game_state)])
+        return self._model.predict(batch)[0]
+
     def set_temperature(self, temperature):
         self._temperature = temperature
 
@@ -47,14 +53,14 @@ def select_move(self, game_state):
         num_moves = self._encoder.board_width * self._encoder.board_height
 
         board_tensor = self._encoder.encode(game_state)
-        X = np.array([board_tensor])
+        x = np.array([board_tensor])
 
         if np.random.random() < self._temperature:
             # Explore random moves.
             move_probs = np.ones(num_moves) / num_moves
         else:
             # Follow our current policy.
-            move_probs = self._model.predict(X)[0]
+            move_probs = self._model.predict(x)[0]
 
         # Prevent move probs from getting stuck at 0 or 1.
         eps = 1e-5
 
@@ -16,16 +16,20 @@
 # tag::dl_agent_init[]
 class DeepLearningAgent(Agent):
     def __init__(self, model, encoder):
-        self._model = model
-        self._encoder = encoder
+        Agent.__init__(self)
+        self.model = model
+        self.encoder = encoder
 # end::dl_agent_init[]
 
 # tag::dl_agent_predict[]
+    def predict(self, game_state):
+        """Return the model's output for a single game state.
+
+        The state is encoded with this agent's encoder and wrapped in a
+        batch of one; the first (and only) row of the model's prediction
+        is returned.
+        """
+        batch = np.array([self.encoder.encode(game_state)])
+        return self.model.predict(batch)[0]
+
     def select_move(self, game_state):
-        num_moves = self._encoder.board_width * self._encoder.board_height
-        board_tensor = self._encoder.encode(game_state)
-        X = np.array([board_tensor])
-        move_probs = self._model.predict(X)[0]
+        num_moves = self.encoder.board_width * self.encoder.board_height
+        move_probs = self.predict(game_state)
 # end::dl_agent_predict[]
 
 # tag::dl_agent_probabilities[]
@@ -43,11 +47,11 @@ def select_move(self, game_state):
         ranked_moves = np.random.choice(
             candidates, num_moves, replace=False, p=move_probs)  # <2>
         for point_idx in ranked_moves:
-            point = self._encoder.decode_point_index(point_idx)
+            point = self.encoder.decode_point_index(point_idx)
             if game_state.is_valid_move(goboard.Move.play(point)) and \
                     not is_point_an_eye(game_state.board, point, game_state.next_player):  # <3>
                 return goboard.Move.play(point)
-        return goboard.Move.pass_turn()  # <4> No legal, non-self-destructive moves less.
+        return goboard.Move.pass_turn()  # <4>
 # <1> Turn the probabilities into a ranked list of moves.
 # <2> Sample potential candidates
 # <3> Starting from the top, find a valid move that doesn't reduce eye-space.
@@ -57,11 +61,11 @@ def select_move(self, game_state):
 # tag::dl_agent_serialize[]
     def serialize(self, h5file):
         h5file.create_group('encoder')
-        h5file['encoder'].attrs['name'] = self._encoder.name()
-        h5file['encoder'].attrs['board_width'] = self._encoder.board_width
-        h5file['encoder'].attrs['board_height'] = self._encoder.board_height
+        h5file['encoder'].attrs['name'] = self.encoder.name()
+        h5file['encoder'].attrs['board_width'] = self.encoder.board_width
+        h5file['encoder'].attrs['board_height'] = self.encoder.board_height
         h5file.create_group('model')
-        kerasutil.save_model_to_hdf5_group(self._model, h5file['model'])
+        kerasutil.save_model_to_hdf5_group(self.model, h5file['model'])
 # end::dl_agent_serialize[]