
Policy iteration for reachability reward problems (explicit engine).

git-svn-id: https://www.prismmodelchecker.org/svn/prism/prism/trunk@7667 bbc10eb1-c90d-0410-af57-cb519fbb1720
Dave Parker 12 years ago
commit 4d3d52edaf
6 changed files:
  10   prism/src/explicit/DTMCFromMDPMemorylessAdversary.java
  71   prism/src/explicit/MDP.java
  128  prism/src/explicit/MDPModelChecker.java
  41   prism/src/explicit/MDPSimple.java
  56   prism/src/explicit/MDPSparse.java
  58   prism/src/explicit/rewards/MCRewardsFromMDPRewards.java
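For context on the algorithm this commit adds: policy iteration alternates policy evaluation (solve the DTMC induced by the current memoryless strategy) with policy improvement (switch a state to a choice that is strictly better under the current values), until no state changes. Below is a minimal, self-contained sketch of that loop for expected reachability rewards, using plain arrays instead of the PRISM classes touched in this commit; the 4-state model, its rewards and the iterative evaluation routine are illustrative only.

```java
import java.util.Arrays;

public class PolicyIterationSketch {
    // prob[s][k][t] = P_k(s,t); rew[s][k] = reward for taking choice k in state s.
    // State 3 is the (absorbing) target.
    static final double[][][] prob = {
        { { 0, 1.0, 0, 0 }, { 0, 0, 1.0, 0 } }, // state 0: choice 0 -> state 1, choice 1 -> state 2
        { { 0, 0, 0, 1.0 } },                   // state 1: straight to the target
        { { 0, 0, 0.5, 0.5 } },                 // state 2: self-loop or target, prob 0.5 each
        { { 0, 0, 0, 1.0 } },                   // state 3: target (absorbing)
    };
    static final double[][] rew = { { 1.0, 1.0 }, { 4.0 }, { 1.0 }, { 0.0 } };
    static final boolean[] target = { false, false, false, true };

    public static void main(String[] args) {
        int n = prob.length;
        boolean min = true;              // minimise expected reward to reach the target
        int[] strat = new int[n];        // initial strategy: first choice everywhere
        double[] soln = new double[n];
        boolean done = false;
        while (!done) {
            // Policy evaluation: solve the induced DTMC (PRISM delegates this to DTMCModelChecker)
            soln = evaluatePolicy(strat, soln);
            // Policy improvement: one Bellman step; only switch if some choice is strictly better
            done = true;
            for (int s = 0; s < n; s++) {
                if (target[s])
                    continue;
                int best = strat[s];
                double bestVal = qValue(s, strat[s], soln);
                for (int k = 0; k < prob[s].length; k++) {
                    double v = qValue(s, k, soln);
                    if (min ? v < bestVal - 1e-12 : v > bestVal + 1e-12) {
                        bestVal = v;
                        best = k;
                    }
                }
                if (best != strat[s]) {
                    strat[s] = best;
                    done = false;
                }
            }
        }
        System.out.println("values   = " + Arrays.toString(soln));   // approx. [3.0, 4.0, 2.0, 0.0]
        System.out.println("strategy = " + Arrays.toString(strat));  // [1, 0, 0, 0]
    }

    // Expected one-step reward plus expected value of successors, for choice k in state s
    static double qValue(int s, int k, double[] vect) {
        double d = rew[s][k];
        for (int t = 0; t < vect.length; t++)
            d += prob[s][k][t] * vect[t];
        return d;
    }

    // Evaluate the DTMC induced by strat with a simple Gauss-Seidel-style iteration
    static double[] evaluatePolicy(int[] strat, double[] init) {
        double[] soln = init.clone();
        for (int iter = 0; iter < 100000; iter++) {
            double maxDiff = 0.0;
            for (int s = 0; s < soln.length; s++) {
                double d = target[s] ? 0.0 : qValue(s, strat[s], soln);
                maxDiff = Math.max(maxDiff, Math.abs(d - soln[s]));
                soln[s] = d;
            }
            if (maxDiff < 1e-12)
                break;
        }
        return soln;
    }
}
```

The real implementation added in MDPModelChecker.java below additionally re-uses the previous solution as the starting point for each evaluation and, for minimum rewards, seeds the initial strategy so that no chosen transition leads into the known-infinite states.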

10  prism/src/explicit/DTMCFromMDPMemorylessAdversary.java

@@ -216,8 +216,11 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit
public void prob1step(BitSet subset, BitSet u, BitSet v, BitSet result)
{
// TODO
throw new Error("Not yet supported");
for (int s = 0; s < numStates; s++) {
if (subset.get(s)) {
result.set(s, mdp.prob1stepSingle(s, adv[s], u, v));
}
}
}
@Override
@@ -235,8 +238,7 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit
@Override
public double mvMultRewSingle(int s, double vect[], MCRewards mcRewards)
{
throw new RuntimeException("Not implemented yet");
//return mdp.mvMultRewSingle(s, adv[s], vect);
return adv[s] >= 0 ? mdp.mvMultRewSingle(s, adv[s], vect, mcRewards) : 0;
}
@Override
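The two stubs filled in above are exactly what a DTMCModelChecker needs in order to evaluate the chain induced by a memoryless strategy: prob1step feeds the Prob1-style precomputation on the induced DTMC, and mvMultRewSingle its reward value iteration. The evaluation step of the new policy-iteration code (shown in full in MDPModelChecker.java below) boils down to the following sketch, with mdp, mdpRewards, target, strat, soln and mcDTMC assumed to be in scope:

```java
// View the MDP under the memoryless strategy strat as a DTMC, fold the rewards accordingly,
// and solve the induced chain; the previous policy's values can be passed in as a starting point.
DTMC dtmc = new DTMCFromMDPMemorylessAdversary(mdp, strat);
MCRewards mcRewards = new MCRewardsFromMDPRewards(mdpRewards, strat);
ModelCheckerResult res = mcDTMC.computeReachRewards(dtmc, mcRewards, target, reUseSoln ? soln : null, null);
soln = res.soln;
```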

71  prism/src/explicit/MDP.java

@@ -32,6 +32,7 @@ import java.util.List;
import java.util.Map.Entry;
import prism.PrismException;
import explicit.rewards.MCRewards;
import explicit.rewards.MDPRewards;
/**
@@ -98,6 +99,16 @@ public interface MDP extends NondetModel
*/
public void prob1step(BitSet subset, BitSet u, BitSet v, boolean forall, BitSet result);
/**
* Perform a single step of the Prob1 precomputation algorithm for a single state/choice,
* i.e., return whether there is a transition to a state in {@code v} and all transitions go to states in {@code u}.
* @param s State (row) index
* @param i Choice index
* @param u Set of states {@code u}
* @param v Set of states {@code v}
*/
public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v);
/**
* Do a matrix-vector multiplication followed by min/max, i.e. one step of value iteration,
* i.e. for all s: result[s] = min/max_k { sum_j P_k(s,j)*vect[j] }
@@ -134,10 +145,10 @@ public interface MDP extends NondetModel
/**
* Do a single row of matrix-vector multiplication for a specific choice.
* @param s State (row) index
* @param k Choice index
* @param i Choice index
* @param vect Vector to multiply by
*/
public double mvMultSingle(int s, int k, double vect[]);
public double mvMultSingle(int s, int i, double vect[]);
/**
* Do a Gauss-Seidel-style matrix-vector multiplication followed by min/max.
@@ -171,14 +182,14 @@ public interface MDP extends NondetModel
* Do a single row of Jacobi-style matrix-vector multiplication for a specific choice.
* i.e. return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* @param s Row index
* @param k Choice index
* @param i Choice index
* @param vect Vector to multiply by
*/
public double mvMultJacSingle(int s, int k, double vect[]);
public double mvMultJacSingle(int s, int i, double vect[]);
/**
* Do a matrix-vector multiplication and sum of action reward followed by min/max, i.e. one step of value iteration.
* i.e. for all s: result[s] = min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] }
* Do a matrix-vector multiplication and sum of rewards followed by min/max, i.e. one step of value iteration.
* i.e. for all s: result[s] = min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] }
* Optionally, store optimal (memoryless) strategy info.
* @param vect Vector to multiply by
* @param mdpRewards The rewards
@@ -191,8 +202,30 @@ public interface MDP extends NondetModel
public void mvMultRewMinMax(double vect[], MDPRewards mdpRewards, boolean min, double result[], BitSet subset, boolean complement, int strat[]);
/**
* Do a Gauss-Seidel-style matrix-vector multiplication and sum of action reward followed by min/max.
* i.e. for all s: vect[s] = min/max_k { rew(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* Do a single row of matrix-vector multiplication and sum of rewards followed by min/max.
* i.e. return min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] }
* Optionally, store optimal (memoryless) strategy info.
* @param s Row index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max (true=min, false=max)
* @param strat Storage for (memoryless) strategy choice indices (ignored if null)
*/
public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
/**
* Do a single row of matrix-vector multiplication and sum of rewards for a specific choice.
* i.e. rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j]
* @param s State (row) index
* @param i Choice index
* @param vect Vector to multiply by
* @param mcRewards The rewards
*/
public double mvMultRewSingle(int s, int i, double vect[], MCRewards mcRewards);
/**
* Do a Gauss-Seidel-style matrix-vector multiplication and sum of rewards followed by min/max.
* i.e. for all s: vect[s] = min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* and store new values directly in {@code vect} as computed.
* The maximum (absolute/relative) difference between old/new
* elements of {@code vect} is also returned.
@@ -209,22 +242,10 @@ public interface MDP extends NondetModel
public double mvMultRewGSMinMax(double vect[], MDPRewards mdpRewards, boolean min, BitSet subset, boolean complement, boolean absolute, int strat[]);
/**
* Do a single row of matrix-vector multiplication and sum of action reward followed by min/max.
* i.e. return min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] }
* Do a single row of Jacobi-style matrix-vector multiplication and sum of rewards followed by min/max.
* i.e. return min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* Optionally, store optimal (memoryless) strategy info.
* @param s Row index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max for (true=min, false=max)
* @param strat Storage for (memoryless) strategy choice indices (ignored if null)
*/
public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
/**
* Do a single row of Jacobi-style matrix-vector multiplication and sum of action reward followed by min/max.
* i.e. return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) }
* Optionally, store optimal (memoryless) strategy info.
* @param s Row index
* @param s State (row) index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max for (true=min, false=max)
@@ -233,8 +254,8 @@ public interface MDP extends NondetModel
public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]);
/**
* Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of action reward.
* @param s Row index
* Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of rewards.
* @param s State (row) index
* @param vect Vector to multiply by
* @param mdpRewards The rewards
* @param min Min or max (true=min, false=max)
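To make the new prob1stepSingle contract concrete, here is a small demo built on MDPSimple (whose implementation of the method appears further down in this diff); the three-state model is illustrative only.

```java
import java.util.BitSet;

import explicit.Distribution;
import explicit.MDPSimple;

public class Prob1StepSingleDemo {
    public static void main(String[] args) {
        // Illustrative 3-state MDP: choice 0 of state 0 moves to state 1 or 2 with probability 0.5 each
        MDPSimple mdp = new MDPSimple(3);
        Distribution distr = new Distribution();
        distr.add(1, 0.5);
        distr.add(2, 0.5);
        mdp.addChoice(0, distr);

        BitSet u = new BitSet();
        u.set(1);
        u.set(2);                 // all successors of the choice lie in u
        BitSet v = new BitSet();
        v.set(2);                 // at least one successor lies in v

        System.out.println(mdp.prob1stepSingle(0, 0, u, v)); // true: some successor in v, all in u
        v.clear();
        v.set(0);
        System.out.println(mdp.prob1stepSingle(0, 0, u, v)); // false: no successor in v
    }
}
```

The new mvMultRewSingle is analogous on the reward side: it returns rew(s) plus the vect-weighted sum over the successors of the given choice, as the MDPSimple and MDPSparse implementations below show.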

128  prism/src/explicit/MDPModelChecker.java

@@ -47,6 +47,8 @@ import prism.PrismLangException;
import prism.PrismLog;
import prism.PrismUtils;
import strat.MDStrategyArray;
import explicit.rewards.MCRewards;
import explicit.rewards.MCRewardsFromMDPRewards;
import explicit.rewards.MDPRewards;
/**
@@ -980,6 +982,7 @@ public class MDPModelChecker extends ProbModelChecker
/**
* Compute reachability probabilities using policy iteration.
* Optionally, store optimal (memoryless) strategy info.
* @param mdp: The MDP
* @param no: Probability 0 states
* @param yes: Probability 1 states
@@ -999,7 +1002,7 @@ public class MDPModelChecker extends ProbModelChecker
// Re-use solution to solve each new policy (strategy)?
boolean reUseSoln = true;
// Start value iteration
// Start policy iteration
timer = System.currentTimeMillis();
mainLog.println("Starting policy iteration (" + (min ? "min" : "max") + ")...");
@@ -1393,7 +1396,7 @@ public class MDPModelChecker extends ProbModelChecker
MDPSolnMethod mdpSolnMethod = this.mdpSolnMethod;
// Switch to a supported method, if necessary
if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL)) {
if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL || mdpSolnMethod == MDPSolnMethod.POLICY_ITERATION)) {
mdpSolnMethod = MDPSolnMethod.GAUSS_SEIDEL;
mainLog.printWarning("Switching to MDP solution method \"" + mdpSolnMethod.fullName() + "\"");
}
@@ -1469,6 +1472,9 @@ public class MDPModelChecker extends ProbModelChecker
case GAUSS_SEIDEL:
res = computeReachRewardsGaussSeidel(mdp, mdpRewards, target, inf, min, init, known, strat);
break;
case POLICY_ITERATION:
res = computeReachRewardsPolIter(mdp, mdpRewards, target, inf, min, strat);
break;
default:
throw new PrismException("Unknown MDP solution method " + mdpSolnMethod.fullName());
}
@@ -1684,6 +1690,124 @@ public class MDPModelChecker extends ProbModelChecker
return res;
}
/**
* Compute expected reachability rewards using policy iteration.
* Optionally, store optimal (memoryless) strategy info.
* @param mdp The MDP
* @param mdpRewards The rewards
* @param target Target states
* @param inf States for which reward is infinite
* @param min Min or max rewards (true=min, false=max)
* @param strat Storage for (memoryless) strategy choice indices (ignored if null)
*/
protected ModelCheckerResult computeReachRewardsPolIter(MDP mdp, MDPRewards mdpRewards, BitSet target, BitSet inf, boolean min, int strat[])
throws PrismException
{
ModelCheckerResult res;
int i, n, iters, totalIters;
double soln[], soln2[];
boolean done;
long timer;
DTMCModelChecker mcDTMC;
DTMC dtmc;
MCRewards mcRewards;
// Re-use solution to solve each new policy (strategy)?
boolean reUseSoln = true;
// Start policy iteration
timer = System.currentTimeMillis();
mainLog.println("Starting policy iteration (" + (min ? "min" : "max") + ")...");
// Create a DTMC model checker (for solving policies)
mcDTMC = new DTMCModelChecker(this);
mcDTMC.inheritSettings(this);
mcDTMC.setLog(new PrismDevNullLog());
// Store num states
n = mdp.getNumStates();
// Create solution vector(s)
soln = new double[n];
soln2 = new double[n];
// Initialise solution vectors.
for (i = 0; i < n; i++)
soln[i] = soln2[i] = target.get(i) ? 0.0 : inf.get(i) ? Double.POSITIVE_INFINITY : 0.0;
// If not passed in, create new storage for strategy and initialise
// Initial strategy just picks first choice (0) everywhere
if (strat == null) {
strat = new int[n];
for (i = 0; i < n; i++)
strat[i] = 0;
}
// Otherwise, just initialise for states not in target/inf
// (Optimal choices for target/inf should already be known)
else {
for (i = 0; i < n; i++)
if (!(target.get(i) || inf.get(i)))
strat[i] = 0;
}
// For minimum rewards, we need to make sure that initial strategy choices
// do not result in infinite rewards for any states that are known not to be infinite
// (otherwise policy iteration may not converge)
if (min) {
for (i = 0; i < n; i++) {
if (!(target.get(i) || inf.get(i))) {
int numChoices = mdp.getNumChoices(i);
for (int k = 0; k < numChoices; k++) {
if (!mdp.someSuccessorsInSet(i, k, inf)) {
strat[i] = k;
continue;
}
}
}
}
}
// Start iterations
iters = totalIters = 0;
done = false;
while (!done && iters < maxIters) {
iters++;
// Solve induced DTMC for strategy
dtmc = new DTMCFromMDPMemorylessAdversary(mdp, strat);
mcRewards = new MCRewardsFromMDPRewards(mdpRewards, strat);
res = mcDTMC.computeReachRewards(dtmc, mcRewards, target, reUseSoln ? soln : null, null);
soln = res.soln;
totalIters += res.numIters;
// Check if optimal, improve non-optimal choices
mdp.mvMultRewMinMax(soln, mdpRewards, min, soln2, null, false, null);
done = true;
for (i = 0; i < n; i++) {
// Don't look at target/inf states - we may not have strategy info for them,
// so they might appear non-optimal
if (target.get(i) || inf.get(i))
continue;
if (!PrismUtils.doublesAreClose(soln[i], soln2[i], termCritParam, termCrit == TermCrit.ABSOLUTE)) {
done = false;
List<Integer> opt = mdp.mvMultRewMinMaxSingleChoices(i, soln, mdpRewards, min, soln2[i]);
// Only update strategy if strictly better
if (!opt.contains(strat[i]))
strat[i] = opt.get(0);
}
}
}
// Finished policy iteration
timer = System.currentTimeMillis() - timer;
mainLog.print("Policy iteration");
mainLog.println(" took " + iters + " cycles (" + totalIters + " iterations in total) and " + timer / 1000.0 + " seconds.");
// Return results
res = new ModelCheckerResult();
res.soln = soln;
res.numIters = totalIters;
res.timeTaken = timer / 1000.0;
return res;
}
/**
* Construct strategy information for min/max expected reachability.
* (More precisely, list of indices of choices resulting in min/max.)
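With the extra case in the dispatch above, expected reachability rewards can now be solved by policy iteration in the explicit engine. A hedged usage sketch follows; the setMDPSolnMethod setter and the way the checker is constructed are assumptions rather than something this diff shows (in practice the solution method is normally selected through PRISM's settings, e.g. a command-line switch such as -politer).

```java
import java.util.BitSet;

import explicit.MDP;
import explicit.MDPModelChecker;
import explicit.ModelCheckerResult;
import explicit.ProbModelChecker.MDPSolnMethod;
import explicit.rewards.MDPRewards;

// Assumed to be in scope already: a parent PrismComponent, an MDP `mdp`,
// rewards `mdpRewards` and a BitSet `target` of goal states.
MDPModelChecker mc = new MDPModelChecker(parent);
mc.setMDPSolnMethod(MDPSolnMethod.POLICY_ITERATION);   // assumed setter; normally set via PRISM settings
ModelCheckerResult res = mc.computeReachRewards(mdp, mdpRewards, target, true /* min */);
System.out.println("Min expected reward from the initial state: " + res.soln[mdp.getFirstInitialState()]);
```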

41  prism/src/explicit/MDPSimple.java

@@ -41,6 +41,7 @@ import java.util.Map.Entry;
import prism.PrismException;
import prism.PrismUtils;
import explicit.rewards.MCRewards;
import explicit.rewards.MDPRewards;
/**
@@ -671,6 +672,13 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
}
}
@Override
public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v)
{
Distribution distr = trans.get(s).get(i);
return distr.containsOneOf(v) && distr.isSubsetOf(u);
}
@Override
public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[])
{
@@ -748,11 +756,12 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
}
@Override
public double mvMultSingle(int s, int k, double vect[])
public double mvMultSingle(int s, int i, double vect[])
{
double d, prob;
int k;
Distribution distr = trans.get(s).get(k);
Distribution distr = trans.get(s).get(i);
// Compute sum for this distribution
d = 0.0;
for (Map.Entry<Integer, Double> e : distr) {
@@ -816,19 +825,20 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
}
@Override
public double mvMultJacSingle(int s, int k, double vect[])
public double mvMultJacSingle(int s, int i, double vect[])
{
double diag, d, prob;
int k;
Distribution distr;
distr = trans.get(s).get(k);
distr = trans.get(s).get(i);
diag = 1.0;
// Compute sum for this distribution
d = 0.0;
for (Map.Entry<Integer, Double> e : distr) {
k = (Integer) e.getKey();
prob = (Double) e.getValue();
if (k != s) {
if (i != s) {
d += prob * vect[k];
} else {
diag -= prob;
@@ -885,6 +895,27 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple
return minmax;
}
@Override
public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards)
{
double d, prob;
int k;
Distribution distr = trans.get(s).get(i);
// Compute sum for this distribution
// TODO: use transition rewards when added to DTMCs
// d = mcRewards.getTransitionReward(s);
d = 0;
for (Map.Entry<Integer, Double> e : distr) {
k = (Integer) e.getKey();
prob = (Double) e.getValue();
d += prob * vect[k];
}
d += mcRewards.getStateReward(s);
return d;
}
@Override
public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[])
{

56  prism/src/explicit/MDPSparse.java

@@ -43,6 +43,7 @@ import java.util.TreeMap;
import parser.State;
import prism.PrismException;
import prism.PrismUtils;
import explicit.rewards.MCRewards;
import explicit.rewards.MDPRewards;
/**
@@ -754,6 +755,29 @@ public class MDPSparse extends MDPExplicit
}
}
@Override
public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v)
{
int j, k, l2, h2;
boolean some, all;
j = rowStarts[s] + i;
some = false;
all = true;
l2 = choiceStarts[j];
h2 = choiceStarts[j + 1];
for (k = l2; k < h2; k++) {
// Assume that only non-zero entries are stored
if (v.get(cols[k])) {
some = true;
}
if (!u.get(cols[k])) {
all = false;
}
}
return some && all;
}
@Override
public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[])
{
@@ -825,12 +849,12 @@ public class MDPSparse extends MDPExplicit
}
@Override
public double mvMultSingle(int s, int k, double vect[])
public double mvMultSingle(int s, int i, double vect[])
{
int j, l2, h2;
int j, k, l2, h2;
double d;
j = rowStarts[s] + k;
j = rowStarts[s] + i;
// Compute sum for this distribution
d = 0.0;
l2 = choiceStarts[j];
@@ -891,12 +915,12 @@ public class MDPSparse extends MDPExplicit
}
@Override
public double mvMultJacSingle(int s, int k, double vect[])
public double mvMultJacSingle(int s, int i, double vect[])
{
int j, l2, h2;
int j, k, l2, h2;
double diag, d;
j = rowStarts[s] + k;
j = rowStarts[s] + i;
diag = 1.0;
// Compute sum for this distribution
d = 0.0;
@@ -958,6 +982,26 @@ public class MDPSparse extends MDPExplicit
return minmax;
}
@Override
public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards)
{
int j, k, l2, h2;
double d;
j = rowStarts[s] + i;
// Compute sum for this distribution
// TODO: use transition rewards when added to DTMCs
// d = mcRewards.getTransitionReward(s);
d = 0;
l2 = choiceStarts[j];
h2 = choiceStarts[j + 1];
for (k = l2; k < h2; k++) {
d += nonZeros[k] * vect[cols[k]];
}
d += mcRewards.getStateReward(s);
return d;
}
@Override
public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[])
{

58  prism/src/explicit/rewards/MCRewardsFromMDPRewards.java

@@ -0,0 +1,58 @@
//==============================================================================
//
// Copyright (c) 2002-
// Authors:
// * Dave Parker <d.a.parker@cs.bham.ac.uk> (University of Birmingham/Oxford)
//
//------------------------------------------------------------------------------
//
// This file is part of PRISM.
//
// PRISM is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// PRISM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with PRISM; if not, write to the Free Software Foundation,
// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//==============================================================================
package explicit.rewards;
/**
* Explicit-state representation of a DTMC rewards structure, constructed (implicitly)
* from an MDP rewards structure and a memoryless deterministic strategy, specified as an array of integer indices.
* This class is read-only: most of the data consists of references to other model info.
*/
public class MCRewardsFromMDPRewards implements MCRewards
{
// MDP rewards
protected MDPRewards mdpRewards;
// Strategy (array of choice indices; -1 denotes no choice)
protected int strat[];
/**
* Constructor: create from MDP rewards and memoryless adversary.
*/
public MCRewardsFromMDPRewards(MDPRewards mdpRewards, int strat[])
{
this.mdpRewards = mdpRewards;
this.strat = strat;
}
@Override
public double getStateReward(int s)
{
// For now, state/transition rewards from MDP are both put into state reward
// This works fine for cumulative rewards, but not instantaneous ones
return mdpRewards.getStateReward(s) + mdpRewards.getTransitionReward(s, strat[s]);
}
}
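A small usage sketch for the new reward wrapper (MDPRewardsSimple and its setters are the usual explicit-engine rewards container, but treat the exact calls as indicative; the numbers are illustrative): under a strategy that fixes choice 1 in state 0, the wrapper folds the MDP's state reward and the chosen action's transition reward into a single DTMC state reward.

```java
import explicit.rewards.MCRewards;
import explicit.rewards.MCRewardsFromMDPRewards;
import explicit.rewards.MDPRewardsSimple;

// Illustrative rewards for a 2-state MDP
MDPRewardsSimple mdpRew = new MDPRewardsSimple(2);
mdpRew.setStateReward(0, 1.0);
mdpRew.setTransitionReward(0, 1, 0.5);      // reward 0.5 for choice 1 of state 0

int[] strat = { 1, 0 };                     // memoryless strategy: choice 1 in state 0
MCRewards mcRew = new MCRewardsFromMDPRewards(mdpRew, strat);
double r = mcRew.getStateReward(0);         // 1.5 = 1.0 + 0.5, per getStateReward above
```

As the comment in getStateReward notes, folding the transition reward into the state reward is fine for cumulative (reachability) rewards, which is all policy iteration needs here, but would not be correct for instantaneous reward properties.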