
Commit

Merge pull request #104 from EssexUniversityMCTS/sldebug
fix long waiting for msg in learning server
ljialin authored Dec 13, 2017
2 parents a397786 + 22b6c19 commit d105783
Showing 5 changed files with 123 additions and 103 deletions.
66 changes: 22 additions & 44 deletions clients/GVGAI-JavaClient/src/DontUnderestimateUchiha/Agent.java
@@ -4,37 +4,27 @@
import serialization.Types;
import utils.ElapsedCpuTimer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Random;

/**
* This class has been built with a simple design in mind.
* It is to be used to store player agent information,
* to be later used by the client to send and receive information
* to and from the server.
*/
/*
Brief description of the controller
Simple decayed e-Greedy algorithm that keeps track of the average reward change for each action,
and picks the best one with probability 1-epsilon
*/
public class Agent extends utils.AbstractPlayer {

/**
* Public method to be called at the start of the communication. No game has been initialized yet.
* Perform one-time setup here.
*/

// TODO: 30/06/2017 Could be optimized further with a PriorityQueue, later
// TODO: 30/06/2017 Might also be better to create a class to hold all of these, or use a stat summary
HashMap<Types.ACTIONS, Double> averageIncreasedReward;
HashMap<Types.ACTIONS, Integer> counter;
// ArrayList<Types.ACTIONS> actionsNoEscape;
int totalCount;


public Agent(){
averageIncreasedReward = new HashMap<>();
counter = new HashMap<>();
// actionsNoEscape = new ArrayList<>();

}
double prevReward;
// Types.ACTIONS prevAction;
double EPSILON = 0.5;
Random random = new Random();

@@ -49,24 +39,20 @@ public Agent(){
public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){

prevReward = 0;

// so you don't let me do this in the constructor? Fine, I'll do it in the first level
// put all non-nil available actions as keys into each mapper, to initialize
if(averageIncreasedReward.size()==0)
for(Types.ACTIONS action : sso.availableActions)
{
// System.out.println(elapsedTimer.remainingTimeMillis());
if(!action.equals(Types.ACTIONS.ACTION_NIL))
{
averageIncreasedReward.put(action,0.0);
counter.put(action,0);
totalCount = 0;
}
}

// prevAction = null;
}

/**
@@ -82,22 +68,20 @@ public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer)
@Override
public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){

// System.out.println(sso.avatarLastAction);
//first step, or when NIL was the last action: pick the next action randomly
if(sso.avatarLastAction.equals(Types.ACTIONS.ACTION_NIL))
{
Types.ACTIONS action= sso.availableActions.get(random.nextInt(sso.availableActions.size()));
prevReward = sso.gameScore;
// prevAction = action;
//
// System.out.println(action);

totalCount++;
return action;
}

//update hashmaps
update(sso.gameScore,sso.avatarLastAction);

//select the next action: pick randomly with probability EPSILON, otherwise use UCB
Types.ACTIONS action;
if(random.nextDouble() < EPSILON)
{
@@ -108,34 +92,31 @@ public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elaps
action = ucbPick();
}

// prevAction = action;
// System.out.println("PICK ACTION "+action+", EPSILON="+EPSILON);

//decrease EPSILON as we learn more about the game we are playing (really?)
if(EPSILON>0.1)
EPSILON -= 0.0001;

totalCount++;
// System.out.println("hello");
return action;
}

//pure greedy pick without UCB; not very good, but left here as a tribute
public Types.ACTIONS greedyPick()
{
double max = -Double.MAX_VALUE;
Types.ACTIONS maxAction = Types.ACTIONS.ACTION_ESCAPE;
for(Types.ACTIONS action : averageIncreasedReward.keySet())
{
System.out.print(action+":"+averageIncreasedReward.get(action)+", "+counter.get(action)+" | ");
if(averageIncreasedReward.get(action)>max) {
max = averageIncreasedReward.get(action);
maxAction = action;
}
}
System.out.println("MAX ACTION: "+maxAction);

return maxAction;
}

// Pick next action using UCB equation
public Types.ACTIONS ucbPick()
{
double maxUCB = -Double.MAX_VALUE;
@@ -158,7 +139,6 @@ public Types.ACTIONS ucbPick()
}

private void update(double curReward, Types.ACTIONS prevAction){
// double curReward = sso.gameScore;
double difReward = curReward-prevReward;
prevReward = curReward;
int counterPrevAction = counter.get(prevAction);
@@ -191,8 +171,6 @@ public int result(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer
update(-100,sso.avatarLastAction);
}

// TODO: 03/07/2017 do whatever learning here before returning the next level

return level;
}

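For readers skimming the diff, the sketch below pulls the bandit logic of the DontUnderestimateUchiha controller above into one self-contained class: a decayed epsilon-greedy policy over the per-action average reward increments, with a UCB-style pick for the exploitation branch (the class comment states the best action is taken with probability 1-epsilon). This is an illustrative reconstruction, not code from this commit; the class name `BanditPolicy`, the exploration constant `C`, and the exact incremental-average update are assumptions, since parts of `ucbPick()` and `update()` are collapsed in the diff above.

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

// Illustrative sketch only -- not part of this commit.
// Decayed epsilon-greedy bandit over the average per-step reward increase of each action,
// using a UCB score for the exploitation branch.
public class BanditPolicy<A> {

    private final Map<A, Double> averageIncreasedReward = new HashMap<>();
    private final Map<A, Integer> counter = new HashMap<>();
    private final Random random = new Random();

    private int totalCount = 0;
    private double epsilon = 0.5;                  // decays towards 0.1, as in the agent
    private static final double C = Math.sqrt(2);  // exploration constant (assumed)

    // With probability epsilon pick uniformly at random, otherwise pick by UCB.
    public A pick(List<A> actions) {
        totalCount++;
        if (epsilon > 0.1) {
            epsilon -= 0.0001;                     // same decay schedule as the agent
        }
        if (random.nextDouble() < epsilon) {
            return actions.get(random.nextInt(actions.size()));
        }
        return ucbPick(actions);
    }

    // UCB1-style score: average reward increase + C * sqrt(ln(totalCount) / n(a)).
    private A ucbPick(List<A> actions) {
        A best = actions.get(0);
        double bestScore = -Double.MAX_VALUE;
        for (A action : actions) {
            int n = counter.getOrDefault(action, 0);
            double exploit = averageIncreasedReward.getOrDefault(action, 0.0);
            double explore = (n == 0)
                    ? Double.MAX_VALUE             // try every action at least once
                    : C * Math.sqrt(Math.log(totalCount) / n);
            if (exploit + explore > bestScore) {
                bestScore = exploit + explore;
                best = action;
            }
        }
        return best;
    }

    // Incremental average of the reward difference observed after taking `action`.
    public void update(A action, double rewardDifference) {
        int n = counter.getOrDefault(action, 0);
        double avg = averageIncreasedReward.getOrDefault(action, 0.0);
        averageIncreasedReward.put(action, (avg * n + rewardDifference) / (n + 1));
        counter.put(action, n + 1);
    }
}
```

In the agent itself this bookkeeping lives directly in `act()` and `update()`, fed with the reward difference `sso.gameScore - prevReward` after each step.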
47 changes: 18 additions & 29 deletions clients/GVGAI-JavaClient/src/kkunan/Agent.java
@@ -14,19 +14,14 @@
import java.util.HashMap;
import java.util.Random;

/**
* This class has been built with a simple design in mind.
* It is to be used to store player agent information,
* to be later used by the client to send and receive information
* to and from the server.
*/
/*
Brief description of the controller
Simple Q-learning using 'most' of the Avatar information.
Limit health point was not used because it was difficult to scale.
Avatar position was not used because the screen sizes can be different, so nothing to rely on in that case.
*/
public class Agent extends utils.AbstractPlayer {

/**
* Public method to be called at the start of the communication. No game has been initialized yet.
* Perform one-time setup here.
*/

boolean veryFirstTime = true;
Random random;
@@ -43,17 +38,10 @@ public Agent(){
random = new Random();
}

/**
* Public method to be called at the start of every level of a game.
* Perform any level-entry initialization here.
* @param sso Phase Observation of the current game.
* @param elapsedTimer Timer (1s)
*/
@Override
public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){
if(veryFirstTime)
{
// TODO: 05/07/2017 do whatever is needed to initialize things
previousState = null;
previousReward = 0;
}
@@ -72,7 +60,7 @@ public void init(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer)
*/
@Override
public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elapsedTimer){
// System.out.println("start "+elapsedTimer.remainingTimeMillis());

LearningState currentState = new AvatarInfoState(sso);

//2nd step onwards: "assume" that we have a previous state stored (and we should)
@@ -82,24 +70,28 @@ public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elaps
HashMap<Types.ACTIONS,Double> mapper = QValues.get(previousState);
Types.ACTIONS lastAction = sso.avatarLastAction;

//update the previous values if we have one stored
if(mapper.containsKey(lastAction))
{
double oldQ = mapper.get(lastAction);
double plusReward = sso.gameScore-previousReward;

//actual Q-learning equation
double newQ = oldQ + ALPHA*(plusReward + GAMMA*(getMaxQNext(previousState))-oldQ);
// System.out.println(plusReward+" "+oldQ+" "+getMaxQNext(previousState)+" "+newQ);
// System.out.println("new "+newQ+", old "+oldQ);
mapper.replace(lastAction,newQ);
}

//or just store the game score if we haven't seen this action before
else
{
mapper.put(lastAction,new Double(sso.gameScore));
}
}

Types.ACTIONS toActAction;

//bad coding style, but I like it
//basically just put a new key in the mapper if we don't have it yet
if(QValues.containsKey(currentState))
{

@@ -111,21 +103,20 @@ public Types.ACTIONS act(SerializableStateObservation sso, ElapsedCpuTimer elaps
QValues.put(currentState,mapper);
}

//get the best action with probability 1-EPSILON while we're still learning; when validating, just pick the best we know
if(random.nextDouble() > EPSILON || sso.isValidation) {
toActAction = getMaxAction(currentState, sso.availableActions);
}
//otherwise pick randomly
else toActAction = sso.availableActions.get(random.nextInt(sso.availableActions.size()));

// System.out.println(toActAction);
// printList(AvatarInfoState.generateFeatureFromState(currentState));
// System.out.println(QValues.keySet().size());
previousState = currentState;
previousReward = sso.gameScore;

System.out.println(sso.gameTick+": done "+elapsedTimer.elapsedMillis()+" "+elapsedTimer.remainingTimeMillis());
return toActAction;
}

//just a normal find-the-best-action routine that is unnecessarily long
private Types.ACTIONS getMaxAction(LearningState state, ArrayList<Types.ACTIONS> actions) {
int index = random.nextInt(actions.size());
try {
@@ -141,8 +132,6 @@ private Types.ACTIONS getMaxAction(LearningState state, ArrayList<Types.ACTIONS>
Types.ACTIONS maxAction = mapper.keySet().iterator().next();

for (Types.ACTIONS action : mapper.keySet()) {
// System.out.println(maxAction+" "+mapper.get(maxAction)+", "+action+" "+mapper.get(action));

if (mapper.get(maxAction) < mapper.get(action))
maxAction = action;
}
@@ -155,8 +144,8 @@ private Types.ACTIONS getMaxAction(LearningState state, ArrayList<Types.ACTIONS>
return actions.get(index);
}

//Another get max, but for the next possible states, given the current one
private double getMaxQNext(LearningState state) {

try {
if (!QValues.containsKey(state))
return 0;
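As a companion to the kkunan agent above, here is a minimal, self-contained sketch of a tabular Q-learning table with the textbook update Q(s,a) ← Q(s,a) + α(r + γ·max_a' Q(s',a') − Q(s,a)). It is illustrative only and not part of this commit: the generic `QTable` class and its method names are my own, the α/γ values are not visible in the collapsed hunks, and the agent's own code calls `getMaxQNext(previousState)` where the textbook form bootstraps from the successor state.

```java
import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only -- not part of this commit.
// Tabular Q-learning over generic states S and actions A.
public class QTable<S, A> {

    private final Map<S, Map<A, Double>> qValues = new HashMap<>();
    private final double alpha;   // learning rate (value assumed, not visible in this diff)
    private final double gamma;   // discount factor (value assumed, not visible in this diff)

    public QTable(double alpha, double gamma) {
        this.alpha = alpha;
        this.gamma = gamma;
    }

    // Q(prevState, action) is moved towards reward + gamma * max_a' Q(nextState, a').
    public void update(S prevState, A action, double reward, S nextState) {
        Map<A, Double> row = qValues.computeIfAbsent(prevState, s -> new HashMap<>());
        double oldQ = row.getOrDefault(action, 0.0);
        double target = reward + gamma * maxQ(nextState);
        row.put(action, oldQ + alpha * (target - oldQ));
    }

    // Best known value of a state; 0 if the state has never been seen (as in getMaxQNext).
    public double maxQ(S state) {
        Map<A, Double> row = qValues.get(state);
        if (row == null || row.isEmpty()) {
            return 0.0;
        }
        return row.values().stream().mapToDouble(Double::doubleValue).max().getAsDouble();
    }

    // Greedy action for a state, or null if the state is unknown.
    public A bestAction(S state) {
        Map<A, Double> row = qValues.get(state);
        if (row == null || row.isEmpty()) {
            return null;
        }
        A best = null;
        double bestValue = -Double.MAX_VALUE;
        for (Map.Entry<A, Double> e : row.entrySet()) {
            if (e.getValue() > bestValue) {
                bestValue = e.getValue();
                best = e.getKey();
            }
        }
        return best;
    }
}
```

In `kkunan.Agent.act()` this corresponds to the block that computes `plusReward = sso.gameScore - previousReward` and replaces the old value in `mapper`.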
(The remaining 3 changed files are not shown here.)
