Policy iteration for reachability reward problems (explicit engine).

git-svn-id: https://www.prismmodelchecker.org/svn/prism/prism/trunk@7667 bbc10eb1-c90d-0410-af57-cb519fbb1720
13 years ago · 4d3d52edaf
6 changed files with 322 additions and 42 deletions
--- a/prism/src/explicit/DTMCFromMDPMemorylessAdversary.java
+++ b/prism/src/explicit/DTMCFromMDPMemorylessAdversary.java
@ -216,8 +216,11 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit
 	public void prob1step(BitSet subset, BitSet u, BitSet v, BitSet result)
 	{
 		// TODO
 		throw new Error("Not yet supported");
 		for (int s = 0; s < numStates; s++) {
 			if (subset.get(s)) {
 				result.set(s, mdp.prob1stepSingle(s, adv[s], u, v));
 			}
 		}
 	}
 	@Override
@ -235,8 +238,7 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit
 	@Override
 	public double mvMultRewSingle(int s, double vect[], MCRewards mcRewards)
 	{
 		throw new RuntimeException("Not implemented yet");
 		//return mdp.mvMultRewSingle(s, adv[s], vect);
 		return adv[s] >= 0 ? mdp.mvMultRewSingle(s, adv[s], vect, mcRewards) : 0;
 	}
 	@Override
--- a/prism/src/explicit/MDP.java
+++ b/prism/src/explicit/MDP.java
@ -32,6 +32,7 @@ import java.util.List;
 import java.util.Map.Entry;
 import prism.PrismException;
 import explicit.rewards.MCRewards;
 import explicit.rewards.MDPRewards;
 /**
@ -98,6 +99,16 @@ public interface MDP extends NondetModel
 	 */
 	public void prob1step(BitSet subset, BitSet u, BitSet v, boolean forall, BitSet result);
 	/**
 	 * Perform a single step of precomputation algorithm Prob1 for a single state/choice,
 	 * i.e., return whether there is a transition to a state in {@code v} and all transitions go to states in {@code u}.
 	 * @param s State (row) index
 	 * @param i Choice index
 	 * @param u Set of states {@code u}
 	 * @param v Set of states {@code v}
 	 */
 	public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v);
 	/**
 	 * Do a matrix-vector multiplication followed by min/max, i.e. one step of value iteration,
 	 * i.e. for all s: result[s] = min/max_k { sum_j P_k(s,j)*vect[j] }
@ -134,10 +145,10 @@ public interface MDP extends NondetModel
 	/**
 	 * Do a single row of matrix-vector multiplication for a specific choice.
 	 * @param s State (row) index
 	 * @param k Choice index
 	 * @param i Choice index
 	 * @param vect Vector to multiply by
 	 */
 	public double mvMultSingle(int s, int k, double vect[]);
 	public double mvMultSingle(int s, int i, double vect[]);
 	/**
 	 * Do a Gauss-Seidel-style matrix-vector multiplication followed by min/max.
@ -171,14 +182,14 @@ public interface MDP extends NondetModel
 	 * Do a single row of Jacobi-style matrix-vector multiplication for a specific choice.
 	 * i.e. return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
 	 * @param s Row index
 	 * @param k Choice index
 	 * @param i Choice index
 	 * @param vect Vector to multiply by
 	 */
 	public double mvMultJacSingle(int s, int k, double vect[]);
 	public double mvMultJacSingle(int s, int i, double vect[]);
 	/**
 	 * Do a matrix-vector multiplication and sum of action reward followed by min/max, i.e. one step of value iteration.
 	 * i.e. for all s: result[s] = min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] }
 	 * Do a matrix-vector multiplication and sum of rewards followed by min/max, i.e. one step of value iteration.
 	 * i.e. for all s: result[s] = min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] }
 	 * Optionally, store optimal (memoryless) strategy info. 
 	 * @param vect Vector to multiply by
 	 * @param mdpRewards The rewards
@ -191,8 +202,30 @@ public interface MDP extends NondetModel
 	public void mvMultRewMinMax(double vect[], MDPRewards mdpRewards, boolean min, double result[], BitSet subset, boolean complement, int strat[]);
 	/**
 	 * Do a Gauss-Seidel-style matrix-vector multiplication and sum of action reward followed by min/max.
 	 * i.e. for all s: vect[s] = min/max_k { rew(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
 	 * Do a single row of matrix-vector multiplication and sum of rewards followed by min/max.
 	 * i.e. return min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] }
 	 * Optionally, store optimal (memoryless) strategy info. 
 	 * @param s Row index
 	 * @param vect Vector to multiply by
 	 * @param mdpRewards The rewards
 	 * @param min Min or max for (true=min, false=max)
 	 * @param strat Storage for (memoryless) strategy choice indices (ignored if null)
 	 */
 	public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
 	/**
 	 * Do a single row of matrix-vector multiplication and sum of rewards for a specific choice.
 	 * i.e. rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j]
 	 * @param s State (row) index
 	 * @param i Choice index
 	 * @param vect Vector to multiply by
 	 * @param mcRewards The rewards
 	 */
 	public double mvMultRewSingle(int s, int i, double vect[], MCRewards mcRewards);
 	/**
 	 * Do a Gauss-Seidel-style matrix-vector multiplication and sum of rewards followed by min/max.
 	 * i.e. for all s: vect[s] = min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
 	 * and store new values directly in {@code vect} as computed.
 	 * The maximum (absolute/relative) difference between old/new
 	 * elements of {@code vect} is also returned.
@ -209,22 +242,10 @@ public interface MDP extends NondetModel
 	public double mvMultRewGSMinMax(double vect[], MDPRewards mdpRewards, boolean min, BitSet subset, boolean complement, boolean absolute, int strat[]);
 	/**
 	 * Do a single row of matrix-vector multiplication and sum of action reward followed by min/max.
 	 * i.e. return min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] }
 	 * Do a single row of Jacobi-style matrix-vector multiplication and sum of rewards followed by min/max.
 	 * i.e. return min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
 	 * Optionally, store optimal (memoryless) strategy info. 
 	 * @param s Row index
 	 * @param vect Vector to multiply by
 	 * @param mdpRewards The rewards
 	 * @param min Min or max for (true=min, false=max)
 	 * @param strat Storage for (memoryless) strategy choice indices (ignored if null)
 	 */
 	public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
 	/**
 	 * Do a single row of Jacobi-style matrix-vector multiplication and sum of action reward followed by min/max.
 	 * i.e. return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
 	 * Optionally, store optimal (memoryless) strategy info. 
 	 * @param s Row index
 	 * @param s State (row) index
 	 * @param vect Vector to multiply by
 	 * @param mdpRewards The rewards
 	 * @param min Min or max for (true=min, false=max)
@ -233,8 +254,8 @@ public interface MDP extends NondetModel
 	public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
 	/**
 	 * Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of action reward.
 	 * @param s Row index
 	 * Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of rewards.
 	 * @param s State (row) index
 	 * @param vect Vector to multiply by
 	 * @param mdpRewards The rewards
 	 * @param min Min or max (true=min, false=max)
--- a/prism/src/explicit/MDPModelChecker.java
+++ b/prism/src/explicit/MDPModelChecker.java
@ -47,6 +47,8 @@ import prism.PrismLangException;
 import prism.PrismLog;
 import prism.PrismUtils;
 import strat.MDStrategyArray;
 import explicit.rewards.MCRewards;
 import explicit.rewards.MCRewardsFromMDPRewards;
 import explicit.rewards.MDPRewards;
 /**
@ -980,6 +982,7 @@ public class MDPModelChecker extends ProbModelChecker
 	/**
 	 * Compute reachability probabilities using policy iteration.
 	 * Optionally, store optimal (memoryless) strategy info. 
 	 * @param mdp: The MDP
 	 * @param no: Probability 0 states
 	 * @param yes: Probability 1 states
@ -999,7 +1002,7 @@ public class MDPModelChecker extends ProbModelChecker
 		// Re-use solution to solve each new policy (strategy)?
 		boolean reUseSoln = true;
 		// Start value iteration
 		// Start policy iteration
 		timer = System.currentTimeMillis();
 		mainLog.println("Starting policy iteration (" + (min ? "min" : "max") + ")...");
@ -1393,7 +1396,7 @@ public class MDPModelChecker extends ProbModelChecker
 		MDPSolnMethod mdpSolnMethod = this.mdpSolnMethod;
 		// Switch to a supported method, if necessary
 		if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL)) {
 		if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL || mdpSolnMethod == MDPSolnMethod.POLICY_ITERATION)) {
 			mdpSolnMethod = MDPSolnMethod.GAUSS_SEIDEL;
 			mainLog.printWarning("Switching to MDP solution method \"" + mdpSolnMethod.fullName() + "\"");
 		}
@ -1469,6 +1472,9 @@ public class MDPModelChecker extends ProbModelChecker
 		case GAUSS_SEIDEL:
 			res = computeReachRewardsGaussSeidel(mdp, mdpRewards, target, inf, min, init, known, strat);
 			break;
 		case POLICY_ITERATION:
 			res = computeReachRewardsPolIter(mdp, mdpRewards, target, inf, min, strat);
 			break;
 		default:
 			throw new PrismException("Unknown MDP solution method " + mdpSolnMethod.fullName());
 		}
@ -1684,6 +1690,124 @@ public class MDPModelChecker extends ProbModelChecker
 		return res;
 	}
 	/**
 	 * Compute expected reachability rewards using policy iteration.
 	 * Optionally, store optimal (memoryless) strategy info. 
 	 * @param mdp The MDP
 	 * @param mdpRewards The rewards
 	 * @param target Target states
 	 * @param inf States for which reward is infinite
 	 * @param min Min or max rewards (true=min, false=max)
 	 * @param strat Storage for (memoryless) strategy choice indices (ignored if null)
 	 */
 	protected ModelCheckerResult computeReachRewardsPolIter(MDP mdp, MDPRewards mdpRewards, BitSet target, BitSet inf, boolean min, int strat[])
 			throws PrismException
 	{
 		ModelCheckerResult res;
 		int i, n, iters, totalIters;
 		double soln[], soln2[];
 		boolean done;
 		long timer;
 		DTMCModelChecker mcDTMC;
 		DTMC dtmc;
 		MCRewards mcRewards;
 		// Re-use solution to solve each new policy (strategy)?
 		boolean reUseSoln = true;
 		// Start policy iteration
 		timer = System.currentTimeMillis();
 		mainLog.println("Starting policy iteration (" + (min ? "min" : "max") + ")...");
 		// Create a DTMC model checker (for solving policies)
 		mcDTMC = new DTMCModelChecker(this);
 		mcDTMC.inheritSettings(this);
 		mcDTMC.setLog(new PrismDevNullLog());
 		// Store num states
 		n = mdp.getNumStates();
 		// Create solution vector(s)
 		soln = new double[n];
 		soln2 = new double[n];
 		// Initialise solution vectors.
 		for (i = 0; i < n; i++)
 			soln[i] = soln2[i] = target.get(i) ? 0.0 : inf.get(i) ? Double.POSITIVE_INFINITY : 0.0;
 		// If not passed in, create new storage for strategy and initialise
 		// Initial strategy just picks first choice (0) everywhere
 		if (strat == null) {
 			strat = new int[n];
 			for (i = 0; i < n; i++)
 				strat[i] = 0;
 		}
 		// Otherwise, just initialise for states not in target/inf
 		// (Optimal choices for target/inf should already be known)
 		else {
 			for (i = 0; i < n; i++)
 				if (!(target.get(i) || inf.get(i)))
 					strat[i] = 0;
 		}
 		// For minimum rewards, we need to make sure that initial strategy choices
 		// do not result in infinite rewards for any states that are know not to be infinite
 		// (otherwise policy iteration may not converge) 
 		if (min) {
 			for (i = 0; i < n; i++) {
 				if (!(target.get(i) || inf.get(i))) {
 					int numChoices = mdp.getNumChoices(i);
 					for (int k = 0; k < numChoices; k++) {
 						if (!mdp.someSuccessorsInSet(i, k, inf)) {
 							strat[i] = k;
 							continue;
 						}
 					}
 				}
 			}
 		}
 		// Start iterations
 		iters = totalIters = 0;
 		done = false;
 		while (!done && iters < maxIters) {
 			iters++;
 			// Solve induced DTMC for strategy
 			dtmc = new DTMCFromMDPMemorylessAdversary(mdp, strat);
 			mcRewards = new MCRewardsFromMDPRewards(mdpRewards, strat);
 			res = mcDTMC.computeReachRewards(dtmc, mcRewards, target, reUseSoln ? soln : null, null);
 			soln = res.soln;
 			totalIters += res.numIters;
 			// Check if optimal, improve non-optimal choices
 			mdp.mvMultRewMinMax(soln, mdpRewards, min, soln2, null, false, null);
 			done = true;
 			for (i = 0; i < n; i++) {
 				// Don't look at target/inf states - we may not have strategy info for them,
 				// so they might appear non-optimal
 				if (target.get(i) || inf.get(i))
 					continue;
 				if (!PrismUtils.doublesAreClose(soln[i], soln2[i], termCritParam, termCrit == TermCrit.ABSOLUTE)) {
 					done = false;
 					List<Integer> opt = mdp.mvMultRewMinMaxSingleChoices(i, soln, mdpRewards, min, soln2[i]);
 					// Only update strategy if strictly better
 					if (!opt.contains(strat[i]))
 						strat[i] = opt.get(0);
 				}
 			}
 		}
 		// Finished policy iteration
 		timer = System.currentTimeMillis() - timer;
 		mainLog.print("Policy iteration");
 		mainLog.println(" took " + iters + " cycles (" + totalIters + " iterations in total) and " + timer / 1000.0 + " seconds.");
 		// Return results
 		res = new ModelCheckerResult();
 		res.soln = soln;
 		res.numIters = totalIters;
 		res.timeTaken = timer / 1000.0;
 		return res;
 	}
 	/**
 	 * Construct strategy information for min/max expected reachability.
 	 * (More precisely, list of indices of choices resulting in min/max.)
--- a/prism/src/explicit/MDPSimple.java
+++ b/prism/src/explicit/MDPSimple.java
@ -41,6 +41,7 @@ import java.util.Map.Entry;
 import prism.PrismException;
 import prism.PrismUtils;
 import explicit.rewards.MCRewards;
 import explicit.rewards.MDPRewards;
 /**
@ -671,6 +672,13 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
 		}
 	}
 	@Override
 	public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v)
 	{
 		Distribution distr = trans.get(s).get(i);
 		return distr.containsOneOf(v) && distr.isSubsetOf(u);
 	}
 	@Override
 	public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[])
 	{
@ -748,11 +756,12 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
 	}
 	@Override
 	public double mvMultSingle(int s, int k, double vect[])
 	public double mvMultSingle(int s, int i, double vect[])
 	{
 		double d, prob;
 		int k;
 		Distribution distr = trans.get(s).get(k);
 		Distribution distr = trans.get(s).get(i);
 		// Compute sum for this distribution
 		d = 0.0;
 		for (Map.Entry<Integer, Double> e : distr) {
@ -816,19 +825,20 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
 	}
 	@Override
 	public double mvMultJacSingle(int s, int k, double vect[])
 	public double mvMultJacSingle(int s, int i, double vect[])
 	{
 		double diag, d, prob;
 		int k;
 		Distribution distr;
 		distr = trans.get(s).get(k);
 		distr = trans.get(s).get(i);
 		diag = 1.0;
 		// Compute sum for this distribution
 		d = 0.0;
 		for (Map.Entry<Integer, Double> e : distr) {
 			k = (Integer) e.getKey();
 			prob = (Double) e.getValue();
 			if (k != s) {
 			if (i != s) {
 				d += prob * vect[k];
 			} else {
 				diag -= prob;
@ -885,6 +895,27 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
 		return minmax;
 	}
 	@Override
 	public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards)
 	{
 		double d, prob;
 		int k;
 		Distribution distr = trans.get(s).get(i);
 		// Compute sum for this distribution
 		// TODO: use transition rewards when added to DTMCss
 		// d = mcRewards.getTransitionReward(s);
 		d = 0;
 		for (Map.Entry<Integer, Double> e : distr) {
 			k = (Integer) e.getKey();
 			prob = (Double) e.getValue();
 			d += prob * vect[k];
 		}
 		d += mcRewards.getStateReward(s);
 		return d;
 	}
 	@Override
 	public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[])
 	{
--- a/prism/src/explicit/MDPSparse.java
+++ b/prism/src/explicit/MDPSparse.java
@ -43,6 +43,7 @@ import java.util.TreeMap;
 import parser.State;
 import prism.PrismException;
 import prism.PrismUtils;
 import explicit.rewards.MCRewards;
 import explicit.rewards.MDPRewards;
 /**
@ -754,6 +755,29 @@ public class MDPSparse extends MDPExplicit
 		}
 	}
 	@Override
 	public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v)
 	{
 		int j, k, l2, h2;
 		boolean some, all;
 		j = rowStarts[s] + i;
 		some = false;
 		all = true;
 		l2 = choiceStarts[j];
 		h2 = choiceStarts[j + 1];
 		for (k = l2; k < h2; k++) {
 			// Assume that only non-zero entries are stored
 			if (v.get(cols[k])) {
 				some = true;
 			}
 			if (!u.get(cols[k])) {
 				all = false;
 			}
 		}
 		return some && all;
 	}
 	@Override
 	public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[])
 	{
@ -825,12 +849,12 @@ public class MDPSparse extends MDPExplicit
 	}
 	@Override
 	public double mvMultSingle(int s, int k, double vect[])
 	public double mvMultSingle(int s, int i, double vect[])
 	{
 		int j, l2, h2;
 		int j, k, l2, h2;
 		double d;
 		j = rowStarts[s] + k;
 		j = rowStarts[s] + i;
 		// Compute sum for this distribution
 		d = 0.0;
 		l2 = choiceStarts[j];
@ -891,12 +915,12 @@ public class MDPSparse extends MDPExplicit
 	}
 	@Override
 	public double mvMultJacSingle(int s, int k, double vect[])
 	public double mvMultJacSingle(int s, int i, double vect[])
 	{
 		int j, l2, h2;
 		int j, k, l2, h2;
 		double diag, d;
 		j = rowStarts[s] + k;
 		j = rowStarts[s] + i;
 		diag = 1.0;
 		// Compute sum for this distribution
 		d = 0.0;
@ -958,6 +982,26 @@ public class MDPSparse extends MDPExplicit
 		return minmax;
 	}
 	@Override
 	public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards)
 	{
 		int j, k, l2, h2;
 		double d;
 		j = rowStarts[s] + i;
 		// Compute sum for this distribution
 		// TODO: use transition rewards when added to DTMCss
 		// d = mcRewards.getTransitionReward(s);
 		d = 0;
 		l2 = choiceStarts[j];
 		h2 = choiceStarts[j + 1];
 		for (k = l2; k < h2; k++) {
 			d += nonZeros[k] * vect[cols[k]];
 		}
 		d += mcRewards.getStateReward(s);
 		return d;
 	}
 	@Override
 	public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[])
 	{
--- a/prism/src/explicit/rewards/MCRewardsFromMDPRewards.java
+++ b/prism/src/explicit/rewards/MCRewardsFromMDPRewards.java
@ -0,0 +1,58 @@
 //==============================================================================
 //	
 //	Copyright (c) 2002-
 //	Authors:
 //	* Dave Parker <d.a.parker@cs.bham.ac.uk> (University of Birmingham/Oxford)
 //	
 //------------------------------------------------------------------------------
 //	
 //	This file is part of PRISM.
 //	
 //	PRISM is free software; you can redistribute it and/or modify
 //	it under the terms of the GNU General Public License as published by
 //	the Free Software Foundation; either version 2 of the License, or
 //	(at your option) any later version.
 //	
 //	PRISM is distributed in the hope that it will be useful,
 //	but WITHOUT ANY WARRANTY; without even the implied warranty of
 //	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 //	GNU General Public License for more details.
 //	
 //	You should have received a copy of the GNU General Public License
 //	along with PRISM; if not, write to the Free Software Foundation,
 //	Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 //	
 //==============================================================================
 package explicit.rewards;
 /**
 * Explicit-state representation of a DTMC rewards structure, constructed (implicitly)
 * from an MDP rewards structure and a memoryless deterministic strategy, specified as an array of integer indices.
 * This class is read-only: most of data is pointers to other model info.
 */
 public class MCRewardsFromMDPRewards implements MCRewards
 {
 	// MDP rewards
 	protected MDPRewards mdpRewards;
 	// Strategy (array of choice indices; -1 denotes no choice)
 	protected int strat[];
 	/**
 	 * Constructor: create from MDP rewards and memoryless adversary.
 	 */
 	public MCRewardsFromMDPRewards(MDPRewards mdpRewards, int strat[])
 	{
 		this.mdpRewards = mdpRewards;
 		this.strat = strat;
 	}
 	@Override
 	public double getStateReward(int s)
 	{
 		// For now, state/transition rewards from MDP are both put into state reward
 		// This works fine for cumulative rewards, but not instantaneous ones
 		return mdpRewards.getStateReward(s) + mdpRewards.getTransitionReward(s, strat[s]);
 	}
 }