
Commit

Merge pull request #104 from EssexUniversityMCTS/sldebug
fix long waiting for msg in learning server
ljialin authored Dec 13, 2017
2 parents a397786 + 22b6c19 commit d105783
Showing 5 changed files with 123 additions and 103 deletions.
66 changes: 22 additions & 44 deletions clients/GVGAI-JavaClient/src/DontUnderestimateUchiha/Agent.java
@@ -4,37 +4,27 @@
import serialization.Types;
import utils.ElapsedCpuTimer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Random;

/**
* This class has been built with a simple design in mind.
* It is to be used to store player agent information,
* to be later used by the client to send and receive information
* to and from the server.
*/
/*
Brief description of the controller
Simple decayed e-Greedy algorithm that keeps track of the average reward change for each action,
and picks the best one with probability 1-epsilon
*/
public class Agent extends utils.AbstractPlayer {

/**
* Public method to be called at the start of the communication. No game has been initialized yet.
* Perform one-time setup here.
*/

// TODO: 30/06/2017 Could be optimized further with a PriorityQueue, later
// TODO: 30/06/2017 Might also be better to create a class to hold all of these, or use a stat summary
HashMap<Types.ACTIONS, Double> averageIncreasedReward;
HashMap<Types.ACTIONS, Integer> counter;
// ArrayList<Types.ACTIONS> actionsNoEscape;
int totalCount;


public Agent(){
averageIncreasedReward = new HashMap<>();
counter = new HashMap<>();
// actionsNoEscape = new ArrayList<>();

}
double prevReward;
// Types.ACTIONS prevAction;
double EPSILON = 0.5;
Random random = new Random();

@@ -49,24 +39,20 @@ public Agent(){
public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){

prevReward = 0;

// so you don't let me do this in the constructor? Fine, I'll do it in the first level
// put all non-nil available actions as keys into each mapper, to initialize
if(averageIncreasedReward.size()==0)
for(Types.ACTIONS action : sso.availableActions)
{
// System.out.println(elapsedTimer.remainingTimeMillis());
if(!action.equals(Types.ACTIONS.ACTION_NIL))
{
averageIncreasedReward.put(action,0.0);
counter.put(action,0);
totalCount = 0;
}
}

// prevAction = null;
}

/**
@@ -82,22 +68,20 @@ public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer)
@Override
public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){

// System.out.println(sso.avatarLastAction);
//first step, or when NIL was the last action: pick the next action randomly
if(sso.avatarLastAction.equals(Types.ACTIONS.ACTION_NIL))
{
Types.ACTIONS action= sso.availableActions.get(random.nextInt(sso.availableActions.size()));
prevReward = sso.gameScore;
// prevAction = action;
//
// System.out.println(action);

totalCount++;
return action;
}

//update hashmaps
update(sso.gameScore,sso.avatarLastAction);

//select the next action: pick randomly with probability EPSILON, otherwise use UCB
Types.ACTIONS action;
if(random.nextDouble() < EPSILON)
{
@@ -108,34 +92,31 @@ public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elaps
action = ucbPick();
}

// prevAction = action;
// System.out.println("PICK ACTION "+action+", EPSILON="+EPSILON);

//decrease EPSILON as we learn more about the game we are playing (really?)
if(EPSILON>0.1)
EPSILON -= 0.0001;

totalCount++;
// System.out.println("hello");
return action;
}

//pure greedy pick without UCB; not very good, but left here as a tribute
public Types.ACTIONS greedyPick()
{
double max = -Double.MAX_VALUE;
Types.ACTIONS maxAction = Types.ACTIONS.ACTION_ESCAPE;
for(Types.ACTIONS action : averageIncreasedReward.keySet())
{
System.out.print(action+":"+averageIncreasedReward.get(action)+", "+counter.get(action)+" | ");
if(averageIncreasedReward.get(action)>max) {
max = averageIncreasedReward.get(action);
maxAction = action;
}
}
System.out.println("MAX ACTION: "+maxAction);

return maxAction;
}

// Pick next action using UCB equation
public Types.ACTIONS ucbPick()
{
double maxUCB = -Double.MAX_VALUE;
@@ -158,7 +139,6 @@ public Types.ACTIONS ucbPick()
}

private void update(double curReward, Types.ACTIONS prevAction){
// double curReward = sso.gameScore;
double difReward = curReward-prevReward;
prevReward = curReward;
int counterPrevAction = counter.get(prevAction);
@@ -191,8 +171,6 @@ public int result(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer
update(-100,sso.avatarLastAction);
}

// TODO: 03/07/2017 do whatever learning here before returning the next level

return level;
}

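For readers skimming the diff, the sketch below pulls the bandit logic of the DontUnderestimateUchiha controller above into one self-contained class: a decayed epsilon-greedy policy over the per-action average reward increments, with a UCB-style pick for the exploitation branch (the class comment states the best action is taken with probability 1-epsilon). This is an illustrative reconstruction, not code from this commit; the class name `BanditPolicy`, the exploration constant `C`, and the exact incremental-average update are assumptions, since parts of `ucbPick()` and `update()` are collapsed in the diff above.

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

// Illustrative sketch only -- not part of this commit.
// Decayed epsilon-greedy bandit over the average per-step reward increase of each action,
// using a UCB score for the exploitation branch.
public class BanditPolicy<A> {

    private final Map<A, Double> averageIncreasedReward = new HashMap<>();
    private final Map<A, Integer> counter = new HashMap<>();
    private final Random random = new Random();

    private int totalCount = 0;
    private double epsilon = 0.5;                  // decays towards 0.1, as in the agent
    private static final double C = Math.sqrt(2);  // exploration constant (assumed)

    // With probability epsilon pick uniformly at random, otherwise pick by UCB.
    public A pick(List<A> actions) {
        totalCount++;
        if (epsilon > 0.1) {
            epsilon -= 0.0001;                     // same decay schedule as the agent
        }
        if (random.nextDouble() < epsilon) {
            return actions.get(random.nextInt(actions.size()));
        }
        return ucbPick(actions);
    }

    // UCB1-style score: average reward increase + C * sqrt(ln(totalCount) / n(a)).
    private A ucbPick(List<A> actions) {
        A best = actions.get(0);
        double bestScore = -Double.MAX_VALUE;
        for (A action : actions) {
            int n = counter.getOrDefault(action, 0);
            double exploit = averageIncreasedReward.getOrDefault(action, 0.0);
            double explore = (n == 0)
                    ? Double.MAX_VALUE             // try every action at least once
                    : C * Math.sqrt(Math.log(totalCount) / n);
            if (exploit + explore > bestScore) {
                bestScore = exploit + explore;
                best = action;
            }
        }
        return best;
    }

    // Incremental average of the reward difference observed after taking `action`.
    public void update(A action, double rewardDifference) {
        int n = counter.getOrDefault(action, 0);
        double avg = averageIncreasedReward.getOrDefault(action, 0.0);
        averageIncreasedReward.put(action, (avg * n + rewardDifference) / (n + 1));
        counter.put(action, n + 1);
    }
}
```

In the agent itself this bookkeeping lives directly in `act()` and `update()`, fed with the reward difference `sso.gameScore - prevReward` after each step.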
47 changes: 18 additions & 29 deletions clients/GVGAI-JavaClient/src/kkunan/Agent.java
@@ -14,19 +14,14 @@
import java.util.HashMap;
import java.util.Random;

/**
* This class has been built with a simple design in mind.
* It is to be used to store player agent information,
* to be later used by the client to send and receive information
* to and from the server.
*/
/*
Brief description of the controller
Simple Q-learning using 'most' of the Avatar information.
Limit health point was not used because it was difficult to scale.
Avatar position was not used because the screen sizes can be different, so nothing to rely on in that case.
*/
public class Agent extends utils.AbstractPlayer {

/**
* Public method to be called at the start of the communication. No game has been initialized yet.
* Perform one-time setup here.
*/

boolean veryFirstTime = true;
Random random;
@@ -43,17 +38,10 @@ public Agent(){
random = new Random();
}

/**
* Public method to be called at the start of every level of a game.
* Perform any level-entry initialization here.
* @param sso Phase Observation of the current game.
* @param elapsedTimer Timer (1s)
*/
@Override
public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){
if(veryFirstTime)
{
// TODO: 05/07/2017 do whatever is needed to initialize things
previousState = null;
previousReward = 0;
}
@@ -72,7 +60,7 @@ public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer)
*/
@Override
public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){
// System.out.println("start "+elapsedTimer.remainingTimeMillis());

LearningState currentState = new AvatarInfoState(sso);

//2nd step onwards: "assume" that we have a previous state stored (and we should)
@@ -82,24 +70,28 @@ public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elaps
HashMap<Types.ACTIONS,Double> mapper = QValues.get(previousState);
Types.ACTIONS lastAction = sso.avatarLastAction;

//update the previous values if we have one stored
if(mapper.containsKey(lastAction))
{
double oldQ = mapper.get(lastAction);
double plusReward = sso.gameScore-previousReward;

//actual Q-learning equation
double newQ = oldQ + ALPHA*(plusReward + GAMMA*(getMaxQNext(previousState))-oldQ);
// System.out.println(plusReward+" "+oldQ+" "+getMaxQNext(previousState)+" "+newQ);
// System.out.println("new "+newQ+", old "+oldQ);
mapper.replace(lastAction,newQ);
}

//or just store the game score if we haven't seen this action before
else
{
mapper.put(lastAction,new Double(sso.gameScore));
}
}

Types.ACTIONS toActAction;

//bad coding style, but I like it
//basically just put a new key in the mapper if we don't have it yet
if(QValues.containsKey(currentState))
{

@@ -111,21 +103,20 @@ public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elaps
QValues.put(currentState,mapper);
}

//get the best action with probability 1-EPSILON while we're still learning; when validating, just pick the best we know
if(random.nextDouble() > EPSILON || sso.isValidation) {
toActAction = getMaxAction(currentState, sso.availableActions);
}
//otherwise pick randomly
else toActAction = sso.availableActions.get(random.nextInt(sso.availableActions.size()));

// System.out.println(toActAction);
// printList(AvatarInfoState.generateFeatureFromState(currentState));
// System.out.println(QValues.keySet().size());
previousState = currentState;
previousReward = sso.gameScore;

System.out.println(sso.gameTick+": done "+elapsedTimer.elapsedMillis()+" "+elapsedTimer.remainingTimeMillis());
return toActAction;
}

//just a normal find-the-best-action routine that is unnecessarily long
private Types.ACTIONS getMaxAction(LearningState state, ArrayList<Types.ACTIONS> actions) {
int index = random.nextInt(actions.size());
try {
@@ -141,8 +132,6 @@ private Types.ACTIONS getMaxAction(LearningState state, ArrayList<Types.ACTIONS>
Types.ACTIONS maxAction = mapper.keySet().iterator().next();

for (Types.ACTIONS action : mapper.keySet()) {
// System.out.println(maxAction+" "+mapper.get(maxAction)+", "+action+" "+mapper.get(action));

if (mapper.get(maxAction) < mapper.get(action))
maxAction = action;
}
@@ -155,8 +144,8 @@ private Types.ACTIONS getMaxAction(LearningState state, ArrayList<Types.ACTIONS>
return actions.get(index);
}

//Another get max, but for the next possible states, given the current one
private double getMaxQNext(LearningState state) {

try {
if (!QValues.containsKey(state))
return 0;
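As a companion to the kkunan agent above, here is a minimal, self-contained sketch of a tabular Q-learning table with the textbook update Q(s,a) ← Q(s,a) + α(r + γ·max_a' Q(s',a') − Q(s,a)). It is illustrative only and not part of this commit: the generic `QTable` class and its method names are my own, the α/γ values are not visible in the collapsed hunks, and the agent's own code calls `getMaxQNext(previousState)` where the textbook form bootstraps from the successor state.

```java
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only -- not part of this commit.
// Tabular Q-learning over generic states S and actions A.
public class QTable<S, A> {

    private final Map<S, Map<A, Double>> qValues = new HashMap<>();
    private final double alpha;   // learning rate (value assumed, not visible in this diff)
    private final double gamma;   // discount factor (value assumed, not visible in this diff)

    public QTable(double alpha, double gamma) {
        this.alpha = alpha;
        this.gamma = gamma;
    }

    // Q(prevState, action) is moved towards reward + gamma * max_a' Q(nextState, a').
    public void update(S prevState, A action, double reward, S nextState) {
        Map<A, Double> row = qValues.computeIfAbsent(prevState, s -> new HashMap<>());
        double oldQ = row.getOrDefault(action, 0.0);
        double target = reward + gamma * maxQ(nextState);
        row.put(action, oldQ + alpha * (target - oldQ));
    }

    // Best known value of a state; 0 if the state has never been seen (as in getMaxQNext).
    public double maxQ(S state) {
        Map<A, Double> row = qValues.get(state);
        if (row == null || row.isEmpty()) {
            return 0.0;
        }
        return row.values().stream().mapToDouble(Double::doubleValue).max().getAsDouble();
    }

    // Greedy action for a state, or null if the state is unknown.
    public A bestAction(S state) {
        Map<A, Double> row = qValues.get(state);
        if (row == null || row.isEmpty()) {
            return null;
        }
        A best = null;
        double bestValue = -Double.MAX_VALUE;
        for (Map.Entry<A, Double> e : row.entrySet()) {
            if (e.getValue() > bestValue) {
                bestValue = e.getValue();
                best = e.getKey();
            }
        }
        return best;
    }
}
```

In `kkunan.Agent.act()` this corresponds to the block that computes `plusReward = sso.gameScore - previousReward` and replaces the old value in `mapper`.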
(The remaining 3 changed files are not shown here.)
