package nl.ru.ai.vroon.mdp;
import java.util.Random;
/**
 * @author TES Installatie
 *
 * The Algorithms class implements the two algorithms of this exercise:
 * Value Iteration and Q-learning, each maintaining its own value field
 * over the grid of the given Markov decision problem.
 */
public class Algorithms {
    /** Discount factor applied to future state values. */
    private final double gamma = 0.1;
    /** Exploration rate for the e-greedy policy; decays after every Q-learning step. */
    private double eta = 0.8;
    /** Learning rate for the Q-learning update. */
    private final double alpha = 0.9;
    /** Probability of slipping sideways relative to the intended action. */
    private final double side;
    /** Probability of moving in the intended direction. */
    private final double fwd;
    /** Probability of moving backwards relative to the intended action. */
    private final double bck;
    private final MarkovDecisionProblem mdp;
    private final VField vf;
    private final VField qf;
    /** Single RNG instance, reused across e-greedy decisions. */
    private final Random rand = new Random();

    // Action index convention used throughout: 0=N(UP), 1=E(RIGHT), 2=S(DOWN), 3=W(LEFT).
    private static final Action[] ACTIONS = { Action.UP, Action.RIGHT, Action.DOWN, Action.LEFT };
    // Neighbour offsets in the same N/E/S/W order as ACTIONS.
    private static final int[] DX = { 0, 1, 0, -1 };
    private static final int[] DY = { 1, 0, -1, 0 };

    /**
     * The algorithm class keeps track of two value fields, for the value
     * iteration and Q-learning algorithms respectively.
     *
     * @param mdp the Markov decision problem to solve
     */
    public Algorithms(MarkovDecisionProblem mdp) {
        this.mdp = mdp;
        this.vf = new VField(mdp.getWidth(), mdp.getHeight());
        this.qf = new VField(mdp.getWidth(), mdp.getHeight());
        vf.initialize();
        qf.initialize();
        // The transition probabilities.
        side = mdp.getErrorProbability();
        fwd = mdp.getProbability();
        bck = mdp.getBackProbability();
    }

    /**
     * Collects, for the four neighbours of (x, y) in N/E/S/W order, the stored
     * value from {@code field} and the one-step reward. Neighbours that are out
     * of bounds or obstacles keep value/reward 0.0 (so they still contribute a
     * neutral term to the side/back slip components of other actions, exactly
     * as the per-direction code did) and are flagged NEGATIVE_INFINITY in the
     * returned mask so the corresponding action is excluded from maximization.
     *
     * @param field the value field to read neighbour values from
     * @param x     current state x coordinate
     * @param y     current state y coordinate
     * @param nv    out: neighbour values, indexed by direction
     * @param nr    out: neighbour rewards, indexed by direction
     * @return mask with NEGATIVE_INFINITY at each impossible direction
     */
    private double[] neighbours(VField field, int x, int y, double[] nv, double[] nr) {
        double[] mask = new double[4];
        for (int d = 0; d < 4; d++) {
            int nx = x + DX[d];
            int ny = y + DY[d];
            Field f = mdp.getField(nx, ny);
            if (!f.equals(Field.OUTOFBOUNDS) && !f.equals(Field.OBSTACLE)) {
                nv[d] = field.getValue(nx, ny);
                nr[d] = mdp.getNextReward(nx, ny);
            } else {
                mask[d] = Double.NEGATIVE_INFINITY;
            }
        }
        return mask;
    }

    /**
     * One sweep step of the Value Iteration algorithm for the current state:
     * computes the expected value of each possible action under the slip model
     * (fwd/side/bck), then stores the maximum value and the greedy policy in
     * the V field.
     */
    public void ValueIterator() {
        int x = mdp.getStateXPosition();
        int y = mdp.getStateYPostion();
        double[] nv = new double[4];
        double[] nr = new double[4];
        // FIX: value iteration must bootstrap from its own field (vf); the
        // original read neighbour values from the Q-learning field (qf).
        double[] mask = neighbours(vf, x, y, nv, nr);
        Action action = Action.UP;
        double maxi = Double.NEGATIVE_INFINITY;
        for (int a = 0; a < 4; a++) {
            if (mask[a] == Double.NEGATIVE_INFINITY) {
                continue; // impossible action (obstacle / out of bounds ahead)
            }
            // Relative directions for action a: forward=a, right/left=(a+-1)%4, back=(a+2)%4.
            int r1 = (a + 1) % 4;
            int bk = (a + 2) % 4;
            int r2 = (a + 3) % 4;
            double v = fwd * (nr[a] + gamma * nv[a])
                    + side * (nr[r1] + gamma * nv[r1])
                    + bck * (nr[bk] + gamma * nv[bk])
                    + side * (nr[r2] + gamma * nv[r2]);
            // FIX: the original compared/assigned whole arrays (values > maxi),
            // which does not compile; index the candidate explicitly.
            if (v > maxi) {
                maxi = v;
                action = ACTIONS[a];
            }
        }
        // Setting the newly found V value and policy.
        vf.setValue(x, y, maxi);
        vf.setAction(x, y, action);
    }

    /**
     * One step of the Q-learning algorithm for the current state: estimates
     * Q[s', a'] for each action, chooses e-greedily between a random action
     * and the greedy one, and applies the update
     * Q[s,a] &lt;- Q[s,a] + alpha * (r + gamma * max Q[s',a'] - Q[s,a]).
     * The exploration rate eta decays after every call (floored at 0).
     */
    public void qLearning() {
        int x = mdp.getStateXPosition();
        int y = mdp.getStateYPostion();
        double[] nq = new double[4];
        double[] nr = new double[4];
        double[] mask = neighbours(qf, x, y, nq, nr);
        // Candidate Q[s', a'] values, one per action (N/E/S/W indexing).
        double[] q = new double[4];
        for (int a = 0; a < 4; a++) {
            q[a] = qCalc(mdp, ACTIONS[a], vf, qf, nq[0], nq[2], nq[3], nq[1], nr[a]);
        }
        Action action = Action.UP;
        double maxi = Double.NEGATIVE_INFINITY;
        // E-Greedy: with probability eta explore, otherwise exploit.
        if (rand.nextDouble() < eta) {
            // FIX: the original used nextInt(3), which could never pick LEFT,
            // and never recorded the explored action as the policy.
            int a = rand.nextInt(4);
            maxi = q[a];
            action = ACTIONS[a];
        } else {
            // Greedy: highest Q[s', a'] among the possible actions only.
            for (int a = 0; a < 4; a++) {
                if (mask[a] == Double.NEGATIVE_INFINITY) {
                    continue; // impossible action filtered out
                }
                // FIX: original array-vs-scalar comparison did not compile.
                if (q[a] > maxi) {
                    maxi = q[a];
                    action = ACTIONS[a];
                }
            }
        }
        double r = mdp.getReward();
        // Q[S,A] <- Q[S,A] + alpha * (r + gamma * max(Q[S',A']) - Q[S,A])
        double old = qf.getValue(x, y);
        qf.setValue(x, y, old + alpha * (r + gamma * maxi - old));
        qf.setAction(x, y, action);
        // Lessen the exploration rate, but never let it go negative
        // (the original decremented without a floor).
        eta = Math.max(0.0, eta - 0.05);
    }

    /**
     * Computes the slip-weighted Q[S', A'] estimate for a single action, so the
     * main routine is not cluttered by four near-identical expressions.
     *
     * @param mdp    the problem (unused directly; kept for signature stability)
     * @param a      the intended action
     * @param vf     the value-iteration field (unused; kept for signature stability)
     * @param qf     the Q-learning field (unused; kept for signature stability)
     * @param qNorth Q value of the northern neighbour
     * @param qSouth Q value of the southern neighbour
     * @param qWest  Q value of the western neighbour
     * @param qEast  Q value of the eastern neighbour
     * @param r      reward of the state the action moves towards
     * @return the corresponding Q[S', A'] value, or 0.0 for an unknown action
     */
    private double qCalc(MarkovDecisionProblem mdp, Action a, VField vf, VField qf, double qNorth, double qSouth,
            double qWest, double qEast, double r) {
        if (a.equals(Action.UP))
            return r + gamma * (fwd * qNorth + side * qEast + bck * qSouth + side * qWest);
        if (a.equals(Action.DOWN))
            return r + gamma * (fwd * qSouth + side * qEast + bck * qNorth + side * qWest);
        if (a.equals(Action.RIGHT))
            return r + gamma * (fwd * qEast + side * qSouth + side * qNorth + bck * qWest);
        if (a.equals(Action.LEFT))
            return r + gamma * (fwd * qWest + bck * qEast + side * qNorth + side * qSouth);
        return 0.0;
    }

    /**
     * @return the current Value Iteration field
     */
    public VField getVfield() {
        return vf;
    }

    /**
     * @return the current Q-learning field
     */
    public VField getQfield() {
        return qf;
    }

    /**
     * @param xPos x coordinate of the queried state
     * @param yPos y coordinate of the queried state
     * @return the value at the given location in the Value Iteration field
     */
    public double getValue(int xPos, int yPos) {
        return vf.getValue(xPos, yPos);
    }
}