
Policy iteration for reachability reward problems (explicit engine).

git-svn-id: https://www.prismmodelchecker.org/svn/prism/prism/trunk@7667 bbc10eb1-c90d-0410-af57-cb519fbb1720
Dave Parker 12 years ago
commit 4d3d52edaf
6 changed files:
  10   prism/src/explicit/DTMCFromMDPMemorylessAdversary.java
  71   prism/src/explicit/MDP.java
  128  prism/src/explicit/MDPModelChecker.java
  41   prism/src/explicit/MDPSimple.java
  56   prism/src/explicit/MDPSparse.java
  58   prism/src/explicit/rewards/MCRewardsFromMDPRewards.java
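For context on the algorithm this commit adds: policy iteration alternates policy evaluation (solve the DTMC induced by the current memoryless strategy) with policy improvement (switch a state to a choice that is strictly better under the current values), until no state changes. Below is a minimal, self-contained sketch of that loop for expected reachability rewards, using plain arrays instead of the PRISM classes touched in this commit; the 4-state model, its rewards and the iterative evaluation routine are illustrative only.

```java
import java.util.Arrays;

public class PolicyIterationSketch {
    // prob[s][k][t] = P_k(s,t); rew[s][k] = reward for taking choice k in state s.
    // State 3 is the (absorbing) target.
    static final double[][][] prob = {
        { { 0, 1.0, 0, 0 }, { 0, 0, 1.0, 0 } }, // state 0: choice 0 -> state 1, choice 1 -> state 2
        { { 0, 0, 0, 1.0 } },                   // state 1: straight to the target
        { { 0, 0, 0.5, 0.5 } },                 // state 2: self-loop or target, prob 0.5 each
        { { 0, 0, 0, 1.0 } },                   // state 3: target (absorbing)
    };
    static final double[][] rew = { { 1.0, 1.0 }, { 4.0 }, { 1.0 }, { 0.0 } };
    static final boolean[] target = { false, false, false, true };

    public static void main(String[] args) {
        int n = prob.length;
        boolean min = true;              // minimise expected reward to reach the target
        int[] strat = new int[n];        // initial strategy: first choice everywhere
        double[] soln = new double[n];
        boolean done = false;
        while (!done) {
            // Policy evaluation: solve the induced DTMC (PRISM delegates this to DTMCModelChecker)
            soln = evaluatePolicy(strat, soln);
            // Policy improvement: one Bellman step; only switch if some choice is strictly better
            done = true;
            for (int s = 0; s < n; s++) {
                if (target[s])
                    continue;
                int best = strat[s];
                double bestVal = qValue(s, strat[s], soln);
                for (int k = 0; k < prob[s].length; k++) {
                    double v = qValue(s, k, soln);
                    if (min ? v < bestVal - 1e-12 : v > bestVal + 1e-12) {
                        bestVal = v;
                        best = k;
                    }
                }
                if (best != strat[s]) {
                    strat[s] = best;
                    done = false;
                }
            }
        }
        System.out.println("values   = " + Arrays.toString(soln));   // approx. [3.0, 4.0, 2.0, 0.0]
        System.out.println("strategy = " + Arrays.toString(strat));  // [1, 0, 0, 0]
    }

    // Expected one-step reward plus expected value of successors, for choice k in state s
    static double qValue(int s, int k, double[] vect) {
        double d = rew[s][k];
        for (int t = 0; t < vect.length; t++)
            d += prob[s][k][t] * vect[t];
        return d;
    }

    // Evaluate the DTMC induced by strat with a simple Gauss-Seidel-style iteration
    static double[] evaluatePolicy(int[] strat, double[] init) {
        double[] soln = init.clone();
        for (int iter = 0; iter < 100000; iter++) {
            double maxDiff = 0.0;
            for (int s = 0; s < soln.length; s++) {
                double d = target[s] ? 0.0 : qValue(s, strat[s], soln);
                maxDiff = Math.max(maxDiff, Math.abs(d - soln[s]));
                soln[s] = d;
            }
            if (maxDiff < 1e-12)
                break;
        }
        return soln;
    }
}
```

The real implementation added in MDPModelChecker.java below additionally re-uses the previous solution as the starting point for each evaluation and, for minimum rewards, seeds the initial strategy so that no chosen transition leads into the known-infinite states.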

10  prism/src/explicit/DTMCFromMDPMemorylessAdversary.java

@@ -216,8 +216,11 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit
public void prob1step(BitSet subset, BitSet u, BitSet v, BitSet result)
{
// TODO
throw new Error("Not yet supported");
for (int s = 0; s < numStates; s++) {
if (subset.get(s)) {
result.set(s, mdp.prob1stepSingle(s, adv[s], u, v));
}
}
}
@Override
@@ -235,8 +238,7 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit
@Override
public double mvMultRewSingle(int s, double vect[], MCRewards mcRewards)
{
throw new RuntimeException("Not implemented yet");
//return mdp.mvMultRewSingle(s, adv[s], vect);
return adv[s] >= 0 ? mdp.mvMultRewSingle(s, adv[s], vect, mcRewards) : 0;
}
@Override
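The two stubs filled in above are exactly what a DTMCModelChecker needs in order to evaluate the chain induced by a memoryless strategy: prob1step feeds the Prob1-style precomputation on the induced DTMC, and mvMultRewSingle its reward value iteration. The evaluation step of the new policy-iteration code (shown in full in MDPModelChecker.java below) boils down to the following sketch, with mdp, mdpRewards, target, strat, soln and mcDTMC assumed to be in scope:

```java
// View the MDP under the memoryless strategy strat as a DTMC, fold the rewards accordingly,
// and solve the induced chain; the previous policy's values can be passed in as a starting point.
DTMC dtmc = new DTMCFromMDPMemorylessAdversary(mdp, strat);
MCRewards mcRewards = new MCRewardsFromMDPRewards(mdpRewards, strat);
ModelCheckerResult res = mcDTMC.computeReachRewards(dtmc, mcRewards, target, reUseSoln ? soln : null, null);
soln = res.soln;
```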

71  prism/src/explicit/MDP.java

@@ -32,6 +32,7 @@ import java.util.List;
import java.util.Map.Entry;
import prism.PrismException;
import explicit.rewards.MCRewards;
import explicit.rewards.MDPRewards;
/**
@@ -98,6 +99,16 @@ public interface MDP extends NondetModel
*/
public void prob1step(BitSet subset, BitSet u, BitSet v, boolean forall, BitSet result);
/**
* Perform a single step of the Prob1 precomputation algorithm for a single state/choice,
* i.e., return whether there is a transition to a state in {@code v} and all transitions go to states in {@code u}.
* @param s State (row) index
* @param i Choice index
* @param u Set of states {@code u}
* @param v Set of states {@code v}
*/
public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v);
/**
* Do a matrix-vector multiplication followed by min/max, i.e. one step of value iteration,
* i.e. for all s: result[s] = min/max_k { sum_j P_k(s,j)*vect[j] }
@@ -134,10 +145,10 @@ public interface MDP extends NondetModel
/**
* Do a single row of matrix-vector multiplication for a specific choice.
* @param s State (row) index
* @param k Choice index
* @param i Choice index
* @param vect Vector to multiply by
*/
public double mvMultSingle(int s, int k, double vect[]);
public double mvMultSingle(int s, int i, double vect[]);
/**
* Do a Gauss-Seidel-style matrix-vector multiplication followed by min/max.
@@ -171,14 +182,14 @@ public interface MDP extends NondetModel
* Do a single row of Jacobi-style matrix-vector multiplication for a specific choice.
* i.e. return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* @param s Row index
* @param k Choice index
* @param i Choice index
* @param vect Vector to multiply by
*/
public double mvMultJacSingle(int s, int k, double vect[]);
public double mvMultJacSingle(int s, int i, double vect[]);
/**
* Do a matrix-vector multiplication and sum of action reward followed by min/max, i.e. one step of value iteration.
* i.e. for all s: result[s] = min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] }
* Do a matrix-vector multiplication and sum of rewards followed by min/max, i.e. one step of value iteration.
* i.e. for all s: result[s] = min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] }
* Optionally, store optimal (memoryless) strategy info.
* @param vect Vector to multiply by
* @param mdpRewards The rewards
@@ -191,8 +202,30 @@ public interface MDP extends NondetModel
public void mvMultRewMinMax(double vect[], MDPRewards mdpRewards, boolean min, double result[], BitSet subset, boolean complement, int strat[]);
/**
* Do a Gauss-Seidel-style matrix-vector multiplication and sum of action reward followed by min/max.
* i.e. for all s: vect[s] = min/max_k { rew(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* Do a single row of matrix-vector multiplication and sum of rewards followed by min/max.
* i.e. return min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] }
* Optionally, store optimal (memoryless) strategy info.
* @param s Row index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max (true=min, false=max)
* @param strat Storage for (memoryless) strategy choice indices (ignored if null)
*/
public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
/**
* Do a single row of matrix-vector multiplication and sum of rewards for a specific choice.
* i.e. rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j]
* @param s State (row) index
* @param i Choice index
* @param vect Vector to multiply by
* @param mcRewards The rewards
*/
public double mvMultRewSingle(int s, int i, double vect[], MCRewards mcRewards);
/**
* Do a Gauss-Seidel-style matrix-vector multiplication and sum of rewards followed by min/max.
* i.e. for all s: vect[s] = min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* and store new values directly in {@code vect} as computed.
* The maximum (absolute/relative) difference between old/new
* elements of {@code vect} is also returned.
@@ -209,22 +242,10 @@ public interface MDP extends NondetModel
public double mvMultRewGSMinMax(double vect[], MDPRewards mdpRewards, boolean min, BitSet subset, boolean complement, boolean absolute, int strat[]);
/**
* Do a single row of matrix-vector multiplication and sum of action reward followed by min/max.
* i.e. return min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] }
* Do a single row of Jacobi-style matrix-vector multiplication and sum of rewards followed by min/max.
* i.e. return min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* Optionally, store optimal (memoryless) strategy info.
* @param s Row index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max for (true=min, false=max)
* @param strat Storage for (memoryless) strategy choice indices (ignored if null)
*/
public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
/**
* Do a single row of Jacobi-style matrix-vector multiplication and sum of action reward followed by min/max.
* i.e. return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* Optionally, store optimal (memoryless) strategy info.
* @param s Row index
* @param s State (row) index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max for (true=min, false=max)
@@ -233,8 +254,8 @@ public interface MDP extends NondetModel
public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
/**
* Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of action reward.
* @param s Row index
* Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of rewards.
* @param s State (row) index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max (true=min, false=max)
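To make the new prob1stepSingle contract concrete, here is a small demo built on MDPSimple (whose implementation of the method appears further down in this diff); the three-state model is illustrative only.

```java
import java.util.BitSet;

import explicit.Distribution;
import explicit.MDPSimple;

public class Prob1StepSingleDemo {
    public static void main(String[] args) {
        // Illustrative 3-state MDP: choice 0 of state 0 moves to state 1 or 2 with probability 0.5 each
        MDPSimple mdp = new MDPSimple(3);
        Distribution distr = new Distribution();
        distr.add(1, 0.5);
        distr.add(2, 0.5);
        mdp.addChoice(0, distr);

        BitSet u = new BitSet();
        u.set(1);
        u.set(2);                 // all successors of the choice lie in u
        BitSet v = new BitSet();
        v.set(2);                 // at least one successor lies in v

        System.out.println(mdp.prob1stepSingle(0, 0, u, v)); // true: some successor in v, all in u
        v.clear();
        v.set(0);
        System.out.println(mdp.prob1stepSingle(0, 0, u, v)); // false: no successor in v
    }
}
```

The new mvMultRewSingle is analogous on the reward side: it returns rew(s) plus the vect-weighted sum over the successors of the given choice, as the MDPSimple and MDPSparse implementations below show.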

128  prism/src/explicit/MDPModelChecker.java

@@ -47,6 +47,8 @@ import prism.PrismLangException;
import prism.PrismLog;
import prism.PrismUtils;
import strat.MDStrategyArray;
import explicit.rewards.MCRewards;
import explicit.rewards.MCRewardsFromMDPRewards;
import explicit.rewards.MDPRewards;
/**
@@ -980,6 +982,7 @@ public class MDPModelChecker extends ProbModelChecker
/**
* Compute reachability probabilities using policy iteration.
* Optionally, store optimal (memoryless) strategy info.
* @param mdp: The MDP
* @param no: Probability 0 states
* @param yes: Probability 1 states
@@ -999,7 +1002,7 @@ public class MDPModelChecker extends ProbModelChecker
// Re-use solution to solve each new policy (strategy)?
boolean reUseSoln = true;
// Start value iteration
// Start policy iteration
timer = System.currentTimeMillis();
mainLog.println("Starting policy iteration (" + (min ? "min" : "max") + ")...");
@@ -1393,7 +1396,7 @@ public class MDPModelChecker extends ProbModelChecker
MDPSolnMethod mdpSolnMethod = this.mdpSolnMethod;
// Switch to a supported method, if necessary
if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL)) {
if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL || mdpSolnMethod == MDPSolnMethod.POLICY_ITERATION)) {
mdpSolnMethod = MDPSolnMethod.GAUSS_SEIDEL;
mainLog.printWarning("Switching to MDP solution method \"" + mdpSolnMethod.fullName() + "\"");
}
@@ -1469,6 +1472,9 @@ public class MDPModelChecker extends ProbModelChecker
case GAUSS_SEIDEL:
res = computeReachRewardsGaussSeidel(mdp, mdpRewards, target, inf, min, init, known, strat);
break;
case POLICY_ITERATION:
res = computeReachRewardsPolIter(mdp, mdpRewards, target, inf, min, strat);
break;
default:
throw new PrismException("Unknown MDP solution method " + mdpSolnMethod.fullName());
}
@@ -1684,6 +1690,124 @@ public class MDPModelChecker extends ProbModelChecker
return res;
}
/**
* Compute expected reachability rewards using policy iteration.
* Optionally, store optimal (memoryless) strategy info.
* @param mdp The MDP
* @param mdpRewards The rewards
* @param target Target states
* @param inf States for which reward is infinite
* @param min Min or max rewards (true=min, false=max)
* @param strat Storage for (memoryless) strategy choice indices (ignored if null)
*/
protected ModelCheckerResult computeReachRewardsPolIter(MDP mdp, MDPRewards mdpRewards, BitSet target, BitSet inf, boolean min, int strat[])
throws PrismException
{
ModelCheckerResult res;
int i, n, iters, totalIters;
double soln[], soln2[];
boolean done;
long timer;
DTMCModelChecker mcDTMC;
DTMC dtmc;
MCRewards mcRewards;
// Re-use solution to solve each new policy (strategy)?
boolean reUseSoln = true;
// Start policy iteration
timer = System.currentTimeMillis();
mainLog.println("Starting policy iteration (" + (min ? "min" : "max") + ")...");
// Create a DTMC model checker (for solving policies)
mcDTMC = new DTMCModelChecker(this);
mcDTMC.inheritSettings(this);
mcDTMC.setLog(new PrismDevNullLog());
// Store num states
n = mdp.getNumStates();
// Create solution vector(s)
soln = new double[n];
soln2 = new double[n];
// Initialise solution vectors.
for (i = 0; i < n; i++)
soln[i] = soln2[i] = target.get(i) ? 0.0 : inf.get(i) ? Double.POSITIVE_INFINITY : 0.0;
// If not passed in, create new storage for strategy and initialise
// Initial strategy just picks first choice (0) everywhere
if (strat == null) {
strat = new int[n];
for (i = 0; i < n; i++)
strat[i] = 0;
}
// Otherwise, just initialise for states not in target/inf
// (Optimal choices for target/inf should already be known)
else {
for (i = 0; i < n; i++)
if (!(target.get(i) || inf.get(i)))
strat[i] = 0;
}
// For minimum rewards, we need to make sure that initial strategy choices
// do not result in infinite rewards for any states that are known not to be infinite
// (otherwise policy iteration may not converge)
if (min) {
for (i = 0; i < n; i++) {
if (!(target.get(i) || inf.get(i))) {
int numChoices = mdp.getNumChoices(i);
for (int k = 0; k < numChoices; k++) {
if (!mdp.someSuccessorsInSet(i, k, inf)) {
strat[i] = k;
continue;
}
}
}
}
}
// Start iterations
iters = totalIters = 0;
done = false;
while (!done && iters < maxIters) {
iters++;
// Solve induced DTMC for strategy
dtmc = new DTMCFromMDPMemorylessAdversary(mdp, strat);
mcRewards = new MCRewardsFromMDPRewards(mdpRewards, strat);
res = mcDTMC.computeReachRewards(dtmc, mcRewards, target, reUseSoln ? soln : null, null);
soln = res.soln;
totalIters += res.numIters;
// Check if optimal, improve non-optimal choices
mdp.mvMultRewMinMax(soln, mdpRewards, min, soln2, null, false, null);
done = true;
for (i = 0; i < n; i++) {
// Don't look at target/inf states - we may not have strategy info for them,
// so they might appear non-optimal
if (target.get(i) || inf.get(i))
continue;
if (!PrismUtils.doublesAreClose(soln[i], soln2[i], termCritParam, termCrit == TermCrit.ABSOLUTE)) {
done = false;
List<Integer> opt = mdp.mvMultRewMinMaxSingleChoices(i, soln, mdpRewards, min, soln2[i]);
// Only update strategy if strictly better
if (!opt.contains(strat[i]))
strat[i] = opt.get(0);
}
}
}
// Finished policy iteration
timer = System.currentTimeMillis() - timer;
mainLog.print("Policy iteration");
mainLog.println(" took " + iters + " cycles (" + totalIters + " iterations in total) and " + timer / 1000.0 + " seconds.");
// Return results
res = new ModelCheckerResult();
res.soln = soln;
res.numIters = totalIters;
res.timeTaken = timer / 1000.0;
return res;
}
/**
* Construct strategy information for min/max expected reachability.
* (More precisely, list of indices of choices resulting in min/max.)
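With the extra case in the dispatch above, expected reachability rewards can now be solved by policy iteration in the explicit engine. A hedged usage sketch follows; the setMDPSolnMethod setter and the way the checker is constructed are assumptions rather than something this diff shows (in practice the solution method is normally selected through PRISM's settings, e.g. a command-line switch such as -politer).

```java
import java.util.BitSet;

import explicit.MDP;
import explicit.MDPModelChecker;
import explicit.ModelCheckerResult;
import explicit.ProbModelChecker.MDPSolnMethod;
import explicit.rewards.MDPRewards;

// Assumed to be in scope already: a parent PrismComponent, an MDP `mdp`,
// rewards `mdpRewards` and a BitSet `target` of goal states.
MDPModelChecker mc = new MDPModelChecker(parent);
mc.setMDPSolnMethod(MDPSolnMethod.POLICY_ITERATION);   // assumed setter; normally set via PRISM settings
ModelCheckerResult res = mc.computeReachRewards(mdp, mdpRewards, target, true /* min */);
System.out.println("Min expected reward from the initial state: " + res.soln[mdp.getFirstInitialState()]);
```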

41  prism/src/explicit/MDPSimple.java

@@ -41,6 +41,7 @@ import java.util.Map.Entry;
import prism.PrismException;
import prism.PrismUtils;
import explicit.rewards.MCRewards;
import explicit.rewards.MDPRewards;
/**
@@ -671,6 +672,13 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
}
}
@Override
public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v)
{
Distribution distr = trans.get(s).get(i);
return distr.containsOneOf(v) && distr.isSubsetOf(u);
}
@Override
public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[])
{
@@ -748,11 +756,12 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
}
@Override
public double mvMultSingle(int s, int k, double vect[])
public double mvMultSingle(int s, int i, double vect[])
{
double d, prob;
int k;
Distribution distr = trans.get(s).get(k);
Distribution distr = trans.get(s).get(i);
// Compute sum for this distribution
d = 0.0;
for (Map.Entry<Integer, Double> e : distr) {
@@ -816,19 +825,20 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
}
@Override
public double mvMultJacSingle(int s, int k, double vect[])
public double mvMultJacSingle(int s, int i, double vect[])
{
double diag, d, prob;
int k;
Distribution distr;
distr = trans.get(s).get(k);
distr = trans.get(s).get(i);
diag = 1.0;
// Compute sum for this distribution
d = 0.0;
for (Map.Entry<Integer, Double> e : distr) {
k = (Integer) e.getKey();
prob = (Double) e.getValue();
if (k != s) {
if (i != s) {
d += prob * vect[k];
} else {
diag -= prob;
@@ -885,6 +895,27 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
return minmax;
}
@Override
public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards)
{
double d, prob;
int k;
Distribution distr = trans.get(s).get(i);
// Compute sum for this distribution
// TODO: use transition rewards when added to DTMCs
// d = mcRewards.getTransitionReward(s);
d = 0;
for (Map.Entry<Integer, Double> e : distr) {
k = (Integer) e.getKey();
prob = (Double) e.getValue();
d += prob * vect[k];
}
d += mcRewards.getStateReward(s);
return d;
}
@Override
public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[])
{

56  prism/src/explicit/MDPSparse.java

@@ -43,6 +43,7 @@ import java.util.TreeMap;
import parser.State;
import prism.PrismException;
import prism.PrismUtils;
import explicit.rewards.MCRewards;
import explicit.rewards.MDPRewards;
/**
@@ -754,6 +755,29 @@ public class MDPSparse extends MDPExplicit
}
}
@Override
public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v)
{
int j, k, l2, h2;
boolean some, all;
j = rowStarts[s] + i;
some = false;
all = true;
l2 = choiceStarts[j];
h2 = choiceStarts[j + 1];
for (k = l2; k < h2; k++) {
// Assume that only non-zero entries are stored
if (v.get(cols[k])) {
some = true;
}
if (!u.get(cols[k])) {
all = false;
}
}
return some && all;
}
@Override
public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[])
{
@@ -825,12 +849,12 @@ public class MDPSparse extends MDPExplicit
}
@Override
public double mvMultSingle(int s, int k, double vect[])
public double mvMultSingle(int s, int i, double vect[])
{
int j, l2, h2;
int j, k, l2, h2;
double d;
j = rowStarts[s] + k;
j = rowStarts[s] + i;
// Compute sum for this distribution
d = 0.0;
l2 = choiceStarts[j];
@@ -891,12 +915,12 @@ public class MDPSparse extends MDPExplicit
}
@Override
public double mvMultJacSingle(int s, int k, double vect[])
public double mvMultJacSingle(int s, int i, double vect[])
{
int j, l2, h2;
int j, k, l2, h2;
double diag, d;
j = rowStarts[s] + k;
j = rowStarts[s] + i;
diag = 1.0;
// Compute sum for this distribution
d = 0.0;
@@ -958,6 +982,26 @@ public class MDPSparse extends MDPExplicit
return minmax;
}
@Override
public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards)
{
int j, k, l2, h2;
double d;
j = rowStarts[s] + i;
// Compute sum for this distribution
// TODO: use transition rewards when added to DTMCs
// d = mcRewards.getTransitionReward(s);
d = 0;
l2 = choiceStarts[j];
h2 = choiceStarts[j + 1];
for (k = l2; k < h2; k++) {
d += nonZeros[k] * vect[cols[k]];
}
d += mcRewards.getStateReward(s);
return d;
}
@Override
public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[])
{

58  prism/src/explicit/rewards/MCRewardsFromMDPRewards.java

@@ -0,0 +1,58 @@
//==============================================================================
//
// Copyright (c) 2002-
// Authors:
// * Dave Parker <d.a.parker@cs.bham.ac.uk> (University of Birmingham/Oxford)
//
//------------------------------------------------------------------------------
//
// This file is part of PRISM.
//
// PRISM is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// PRISM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with PRISM; if not, write to the Free Software Foundation,
// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//==============================================================================
package explicit.rewards;
/**
* Explicit-state representation of a DTMC rewards structure, constructed (implicitly)
* from an MDP rewards structure and a memoryless deterministic strategy, specified as an array of integer indices.
* This class is read-only: most of the data consists of references to other model info.
*/
public class MCRewardsFromMDPRewards implements MCRewards
{
// MDP rewards
protected MDPRewards mdpRewards;
// Strategy (array of choice indices; -1 denotes no choice)
protected int strat[];
/**
* Constructor: create from MDP rewards and memoryless adversary.
*/
public MCRewardsFromMDPRewards(MDPRewards mdpRewards, int strat[])
{
this.mdpRewards = mdpRewards;
this.strat = strat;
}
@Override
public double getStateReward(int s)
{
// For now, state/transition rewards from MDP are both put into state reward
// This works fine for cumulative rewards, but not instantaneous ones
return mdpRewards.getStateReward(s) + mdpRewards.getTransitionReward(s, strat[s]);
}
}
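A small usage sketch for the new reward wrapper (MDPRewardsSimple and its setters are the usual explicit-engine rewards container, but treat the exact calls as indicative; the numbers are illustrative): under a strategy that fixes choice 1 in state 0, the wrapper folds the MDP's state reward and the chosen action's transition reward into a single DTMC state reward.

```java
import explicit.rewards.MCRewards;
import explicit.rewards.MCRewardsFromMDPRewards;
import explicit.rewards.MDPRewardsSimple;

// Illustrative rewards for a 2-state MDP
MDPRewardsSimple mdpRew = new MDPRewardsSimple(2);
mdpRew.setStateReward(0, 1.0);
mdpRew.setTransitionReward(0, 1, 0.5);      // reward 0.5 for choice 1 of state 0

int[] strat = { 1, 0 };                     // memoryless strategy: choice 1 in state 0
MCRewards mcRew = new MCRewardsFromMDPRewards(mdpRew, strat);
double r = mcRew.getStateReward(0);         // 1.5 = 1.0 + 0.5, per getStateReward above
```

As the comment in getStateReward notes, folding the transition reward into the state reward is fine for cumulative (reachability) rewards, which is all policy iteration needs here, but would not be correct for instantaneous reward properties.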