From 4d3d52edaf63b5b517437ab3b2e84a2c87efe4ba Mon Sep 17 00:00:00 2001 From: Dave Parker Date: Thu, 5 Dec 2013 20:03:00 +0000 Subject: [PATCH] Policy iteration for reachability reward problems (explicit engine). git-svn-id: https://www.prismmodelchecker.org/svn/prism/prism/trunk@7667 bbc10eb1-c90d-0410-af57-cb519fbb1720 --- .../DTMCFromMDPMemorylessAdversary.java | 10 +- prism/src/explicit/MDP.java | 71 ++++++---- prism/src/explicit/MDPModelChecker.java | 128 +++++++++++++++++- prism/src/explicit/MDPSimple.java | 41 +++++- prism/src/explicit/MDPSparse.java | 56 +++++++- .../rewards/MCRewardsFromMDPRewards.java | 58 ++++++++ 6 files changed, 322 insertions(+), 42 deletions(-) create mode 100644 prism/src/explicit/rewards/MCRewardsFromMDPRewards.java diff --git a/prism/src/explicit/DTMCFromMDPMemorylessAdversary.java b/prism/src/explicit/DTMCFromMDPMemorylessAdversary.java index 0ed0f799..826f81bf 100644 --- a/prism/src/explicit/DTMCFromMDPMemorylessAdversary.java +++ b/prism/src/explicit/DTMCFromMDPMemorylessAdversary.java @@ -216,8 +216,11 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit public void prob1step(BitSet subset, BitSet u, BitSet v, BitSet result) { - // TODO - throw new Error("Not yet supported"); + for (int s = 0; s < numStates; s++) { + if (subset.get(s)) { + result.set(s, mdp.prob1stepSingle(s, adv[s], u, v)); + } + } } @Override @@ -235,8 +238,7 @@ public class DTMCFromMDPMemorylessAdversary extends DTMCExplicit @Override public double mvMultRewSingle(int s, double vect[], MCRewards mcRewards) { - throw new RuntimeException("Not implemented yet"); - //return mdp.mvMultRewSingle(s, adv[s], vect); + return adv[s] >= 0 ? mdp.mvMultRewSingle(s, adv[s], vect, mcRewards) : 0; } @Override diff --git a/prism/src/explicit/MDP.java b/prism/src/explicit/MDP.java index 9d365b5e..8835e3dd 100644 --- a/prism/src/explicit/MDP.java +++ b/prism/src/explicit/MDP.java @@ -32,6 +32,7 @@ import java.util.List; import java.util.Map.Entry; import prism.PrismException; +import explicit.rewards.MCRewards; import explicit.rewards.MDPRewards; /** @@ -98,6 +99,16 @@ public interface MDP extends NondetModel */ public void prob1step(BitSet subset, BitSet u, BitSet v, boolean forall, BitSet result); + /** + * Perform a single step of precomputation algorithm Prob1 for a single state/choice, + * i.e., return whether there is a transition to a state in {@code v} and all transitions go to states in {@code u}. + * @param s State (row) index + * @param i Choice index + * @param u Set of states {@code u} + * @param v Set of states {@code v} + */ + public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v); + /** * Do a matrix-vector multiplication followed by min/max, i.e. one step of value iteration, * i.e. for all s: result[s] = min/max_k { sum_j P_k(s,j)*vect[j] } @@ -134,10 +145,10 @@ public interface MDP extends NondetModel /** * Do a single row of matrix-vector multiplication for a specific choice. * @param s State (row) index - * @param k Choice index + * @param i Choice index * @param vect Vector to multiply by */ - public double mvMultSingle(int s, int k, double vect[]); + public double mvMultSingle(int s, int i, double vect[]); /** * Do a Gauss-Seidel-style matrix-vector multiplication followed by min/max. @@ -171,14 +182,14 @@ public interface MDP extends NondetModel * Do a single row of Jacobi-style matrix-vector multiplication for a specific choice. * i.e. 
return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) } * @param s Row index - * @param k Choice index + * @param i Choice index * @param vect Vector to multiply by */ - public double mvMultJacSingle(int s, int k, double vect[]); + public double mvMultJacSingle(int s, int i, double vect[]); /** - * Do a matrix-vector multiplication and sum of action reward followed by min/max, i.e. one step of value iteration. - * i.e. for all s: result[s] = min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] } + * Do a matrix-vector multiplication and sum of rewards followed by min/max, i.e. one step of value iteration. + * i.e. for all s: result[s] = min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] } * Optionally, store optimal (memoryless) strategy info. * @param vect Vector to multiply by * @param mdpRewards The rewards @@ -191,8 +202,30 @@ public interface MDP extends NondetModel public void mvMultRewMinMax(double vect[], MDPRewards mdpRewards, boolean min, double result[], BitSet subset, boolean complement, int strat[]); /** - * Do a Gauss-Seidel-style matrix-vector multiplication and sum of action reward followed by min/max. - * i.e. for all s: vect[s] = min/max_k { rew(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) } + * Do a single row of matrix-vector multiplication and sum of rewards followed by min/max. + * i.e. return min/max_k { rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] } + * Optionally, store optimal (memoryless) strategy info. + * @param s Row index + * @param vect Vector to multiply by + * @param mdpRewards The rewards + * @param min Min or max for (true=min, false=max) + * @param strat Storage for (memoryless) strategy choice indices (ignored if null) + */ + public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]); + + /** + * Do a single row of matrix-vector multiplication and sum of rewards for a specific choice. + * i.e. rew(s) + rew_k(s) + sum_j P_k(s,j)*vect[j] + * @param s State (row) index + * @param i Choice index + * @param vect Vector to multiply by + * @param mcRewards The rewards + */ + public double mvMultRewSingle(int s, int i, double vect[], MCRewards mcRewards); + + /** + * Do a Gauss-Seidel-style matrix-vector multiplication and sum of rewards followed by min/max. + * i.e. for all s: vect[s] = min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) } * and store new values directly in {@code vect} as computed. * The maximum (absolute/relative) difference between old/new * elements of {@code vect} is also returned. @@ -209,22 +242,10 @@ public interface MDP extends NondetModel public double mvMultRewGSMinMax(double vect[], MDPRewards mdpRewards, boolean min, BitSet subset, boolean complement, boolean absolute, int strat[]); /** - * Do a single row of matrix-vector multiplication and sum of action reward followed by min/max. - * i.e. return min/max_k { rew(s) + sum_j P_k(s,j)*vect[j] } + * Do a single row of Jacobi-style matrix-vector multiplication and sum of rewards followed by min/max. + * i.e. return min/max_k { rew(s) + rew_k(s) + (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) } * Optionally, store optimal (memoryless) strategy info. 
- * @param s Row index - * @param vect Vector to multiply by - * @param mdpRewards The rewards - * @param min Min or max for (true=min, false=max) - * @param strat Storage for (memoryless) strategy choice indices (ignored if null) - */ - public double mvMultRewMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]); - - /** - * Do a single row of Jacobi-style matrix-vector multiplication and sum of action reward followed by min/max. - * i.e. return min/max_k { (sum_{j!=s} P_k(s,j)*vect[j]) / 1-P_k(s,s) } - * Optionally, store optimal (memoryless) strategy info. - * @param s Row index + * @param s State (row) index * @param vect Vector to multiply by * @param mdpRewards The rewards * @param min Min or max for (true=min, false=max) @@ -233,8 +254,8 @@ public interface MDP extends NondetModel public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]); /** - * Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of action reward. - * @param s Row index + * Determine which choices result in min/max after a single row of matrix-vector multiplication and sum of rewards. + * @param s State (row) index * @param vect Vector to multiply by * @param mdpRewards The rewards * @param min Min or max (true=min, false=max) diff --git a/prism/src/explicit/MDPModelChecker.java b/prism/src/explicit/MDPModelChecker.java index 28f710ac..68e39d9e 100644 --- a/prism/src/explicit/MDPModelChecker.java +++ b/prism/src/explicit/MDPModelChecker.java @@ -47,6 +47,8 @@ import prism.PrismLangException; import prism.PrismLog; import prism.PrismUtils; import strat.MDStrategyArray; +import explicit.rewards.MCRewards; +import explicit.rewards.MCRewardsFromMDPRewards; import explicit.rewards.MDPRewards; /** @@ -980,6 +982,7 @@ public class MDPModelChecker extends ProbModelChecker /** * Compute reachability probabilities using policy iteration. + * Optionally, store optimal (memoryless) strategy info. * @param mdp: The MDP * @param no: Probability 0 states * @param yes: Probability 1 states @@ -999,7 +1002,7 @@ public class MDPModelChecker extends ProbModelChecker // Re-use solution to solve each new policy (strategy)? boolean reUseSoln = true; - // Start value iteration + // Start policy iteration timer = System.currentTimeMillis(); mainLog.println("Starting policy iteration (" + (min ? 
"min" : "max") + ")..."); @@ -1393,7 +1396,7 @@ public class MDPModelChecker extends ProbModelChecker MDPSolnMethod mdpSolnMethod = this.mdpSolnMethod; // Switch to a supported method, if necessary - if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL)) { + if (!(mdpSolnMethod == MDPSolnMethod.VALUE_ITERATION || mdpSolnMethod == MDPSolnMethod.GAUSS_SEIDEL || mdpSolnMethod == MDPSolnMethod.POLICY_ITERATION)) { mdpSolnMethod = MDPSolnMethod.GAUSS_SEIDEL; mainLog.printWarning("Switching to MDP solution method \"" + mdpSolnMethod.fullName() + "\""); } @@ -1469,6 +1472,9 @@ public class MDPModelChecker extends ProbModelChecker case GAUSS_SEIDEL: res = computeReachRewardsGaussSeidel(mdp, mdpRewards, target, inf, min, init, known, strat); break; + case POLICY_ITERATION: + res = computeReachRewardsPolIter(mdp, mdpRewards, target, inf, min, strat); + break; default: throw new PrismException("Unknown MDP solution method " + mdpSolnMethod.fullName()); } @@ -1684,6 +1690,124 @@ public class MDPModelChecker extends ProbModelChecker return res; } + /** + * Compute expected reachability rewards using policy iteration. + * Optionally, store optimal (memoryless) strategy info. + * @param mdp The MDP + * @param mdpRewards The rewards + * @param target Target states + * @param inf States for which reward is infinite + * @param min Min or max rewards (true=min, false=max) + * @param strat Storage for (memoryless) strategy choice indices (ignored if null) + */ + protected ModelCheckerResult computeReachRewardsPolIter(MDP mdp, MDPRewards mdpRewards, BitSet target, BitSet inf, boolean min, int strat[]) + throws PrismException + { + ModelCheckerResult res; + int i, n, iters, totalIters; + double soln[], soln2[]; + boolean done; + long timer; + DTMCModelChecker mcDTMC; + DTMC dtmc; + MCRewards mcRewards; + + // Re-use solution to solve each new policy (strategy)? + boolean reUseSoln = true; + + // Start policy iteration + timer = System.currentTimeMillis(); + mainLog.println("Starting policy iteration (" + (min ? "min" : "max") + ")..."); + + // Create a DTMC model checker (for solving policies) + mcDTMC = new DTMCModelChecker(this); + mcDTMC.inheritSettings(this); + mcDTMC.setLog(new PrismDevNullLog()); + + // Store num states + n = mdp.getNumStates(); + + // Create solution vector(s) + soln = new double[n]; + soln2 = new double[n]; + + // Initialise solution vectors. + for (i = 0; i < n; i++) + soln[i] = soln2[i] = target.get(i) ? 0.0 : inf.get(i) ? 
Double.POSITIVE_INFINITY : 0.0; + + // If not passed in, create new storage for strategy and initialise + // Initial strategy just picks first choice (0) everywhere + if (strat == null) { + strat = new int[n]; + for (i = 0; i < n; i++) + strat[i] = 0; + } + // Otherwise, just initialise for states not in target/inf + // (Optimal choices for target/inf should already be known) + else { + for (i = 0; i < n; i++) + if (!(target.get(i) || inf.get(i))) + strat[i] = 0; + } + // For minimum rewards, we need to make sure that initial strategy choices + // do not result in infinite rewards for any states that are know not to be infinite + // (otherwise policy iteration may not converge) + if (min) { + for (i = 0; i < n; i++) { + if (!(target.get(i) || inf.get(i))) { + int numChoices = mdp.getNumChoices(i); + for (int k = 0; k < numChoices; k++) { + if (!mdp.someSuccessorsInSet(i, k, inf)) { + strat[i] = k; + continue; + } + } + } + } + } + + // Start iterations + iters = totalIters = 0; + done = false; + while (!done && iters < maxIters) { + iters++; + // Solve induced DTMC for strategy + dtmc = new DTMCFromMDPMemorylessAdversary(mdp, strat); + mcRewards = new MCRewardsFromMDPRewards(mdpRewards, strat); + res = mcDTMC.computeReachRewards(dtmc, mcRewards, target, reUseSoln ? soln : null, null); + soln = res.soln; + totalIters += res.numIters; + // Check if optimal, improve non-optimal choices + mdp.mvMultRewMinMax(soln, mdpRewards, min, soln2, null, false, null); + done = true; + for (i = 0; i < n; i++) { + // Don't look at target/inf states - we may not have strategy info for them, + // so they might appear non-optimal + if (target.get(i) || inf.get(i)) + continue; + if (!PrismUtils.doublesAreClose(soln[i], soln2[i], termCritParam, termCrit == TermCrit.ABSOLUTE)) { + done = false; + List opt = mdp.mvMultRewMinMaxSingleChoices(i, soln, mdpRewards, min, soln2[i]); + // Only update strategy if strictly better + if (!opt.contains(strat[i])) + strat[i] = opt.get(0); + } + } + } + + // Finished policy iteration + timer = System.currentTimeMillis() - timer; + mainLog.print("Policy iteration"); + mainLog.println(" took " + iters + " cycles (" + totalIters + " iterations in total) and " + timer / 1000.0 + " seconds."); + + // Return results + res = new ModelCheckerResult(); + res.soln = soln; + res.numIters = totalIters; + res.timeTaken = timer / 1000.0; + return res; + } + /** * Construct strategy information for min/max expected reachability. * (More precisely, list of indices of choices resulting in min/max.) 
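[Editor's note] The method computeReachRewardsPolIter added above alternates two steps: policy evaluation, which solves the DTMC induced by the current memoryless strategy (built via DTMCFromMDPMemorylessAdversary and MCRewardsFromMDPRewards), and policy improvement, which switches a state's choice only when a strictly better one exists, stopping once no choice changes. The standalone sketch below shows the same loop on a dense MDP given as hypothetical arrays prob[s][c][t] and rew[s][c]; it is not PRISM code, it assumes every policy reaches the target with probability 1 (the code above instead handles infinite-reward states separately via the inf set), and like the patch it warm-starts each evaluation from the previous policy's values.

public class PolicyIterationSketch
{
	/** Expected reachability reward via policy iteration; min=true minimises, min=false maximises. */
	public static double[] reachRewardsPolIter(double[][][] prob, double[][] rew, boolean[] target, boolean min)
	{
		int n = prob.length;
		int[] strat = new int[n]; // initial policy: choice 0 in every state
		double[] soln = new double[n];
		boolean done = false;
		while (!done) {
			// 1. Policy evaluation: solve the DTMC induced by strat (warm-started from soln)
			soln = evaluatePolicy(prob, rew, target, strat, soln);
			// 2. Policy improvement: adopt a strictly better choice where one exists
			done = true;
			for (int s = 0; s < n; s++) {
				if (target[s])
					continue;
				int best = strat[s];
				double bestVal = qValue(prob, rew, soln, s, strat[s]);
				for (int c = 0; c < prob[s].length; c++) {
					double v = qValue(prob, rew, soln, s, c);
					if (min ? v < bestVal - 1e-12 : v > bestVal + 1e-12) {
						best = c;
						bestVal = v;
					}
				}
				if (best != strat[s]) {
					strat[s] = best;
					done = false;
				}
			}
		}
		return soln;
	}

	/** One-step value of taking choice c in state s and then following the current values. */
	private static double qValue(double[][][] prob, double[][] rew, double[] soln, int s, int c)
	{
		double d = rew[s][c];
		for (int t = 0; t < prob[s][c].length; t++)
			d += prob[s][c][t] * soln[t];
		return d;
	}

	/** Iterative (Jacobi-style) solution of the induced DTMC, assuming the policy is proper. */
	private static double[] evaluatePolicy(double[][][] prob, double[][] rew, boolean[] target, int[] strat, double[] init)
	{
		int n = prob.length;
		double[] soln = init.clone(), soln2 = new double[n];
		double maxDiff;
		do {
			maxDiff = 0.0;
			for (int s = 0; s < n; s++) {
				soln2[s] = target[s] ? 0.0 : qValue(prob, rew, soln, s, strat[s]);
				maxDiff = Math.max(maxDiff, Math.abs(soln2[s] - soln[s]));
			}
			double[] tmp = soln;
			soln = soln2;
			soln2 = tmp;
		} while (maxDiff > 1e-10);
		return soln;
	}
}

With exact policy evaluation this converges after finitely many improvement steps; the PRISM code above additionally reuses the previous policy's solution vector (reUseSoln) to speed up each iterative DTMC solve.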
diff --git a/prism/src/explicit/MDPSimple.java b/prism/src/explicit/MDPSimple.java index 85bd32ba..68c74cf2 100644 --- a/prism/src/explicit/MDPSimple.java +++ b/prism/src/explicit/MDPSimple.java @@ -41,6 +41,7 @@ import java.util.Map.Entry; import prism.PrismException; import prism.PrismUtils; +import explicit.rewards.MCRewards; import explicit.rewards.MDPRewards; /** @@ -671,6 +672,13 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple } } + @Override + public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v) + { + Distribution distr = trans.get(s).get(i); + return distr.containsOneOf(v) && distr.isSubsetOf(u); + } + @Override public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[]) { @@ -748,11 +756,12 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple } @Override - public double mvMultSingle(int s, int k, double vect[]) + public double mvMultSingle(int s, int i, double vect[]) { double d, prob; + int k; - Distribution distr = trans.get(s).get(k); + Distribution distr = trans.get(s).get(i); // Compute sum for this distribution d = 0.0; for (Map.Entry e : distr) { @@ -816,19 +825,20 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple } @Override - public double mvMultJacSingle(int s, int k, double vect[]) + public double mvMultJacSingle(int s, int i, double vect[]) { double diag, d, prob; + int k; Distribution distr; - distr = trans.get(s).get(k); + distr = trans.get(s).get(i); diag = 1.0; // Compute sum for this distribution d = 0.0; for (Map.Entry e : distr) { k = (Integer) e.getKey(); prob = (Double) e.getValue(); - if (k != s) { + if (i != s) { d += prob * vect[k]; } else { diag -= prob; @@ -885,6 +895,27 @@ public class MDPSimple extends MDPExplicit implements NondetModelSimple return minmax; } + @Override + public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards) + { + double d, prob; + int k; + + Distribution distr = trans.get(s).get(i); + // Compute sum for this distribution + // TODO: use transition rewards when added to DTMCss + // d = mcRewards.getTransitionReward(s); + d = 0; + for (Map.Entry e : distr) { + k = (Integer) e.getKey(); + prob = (Double) e.getValue(); + d += prob * vect[k]; + } + d += mcRewards.getStateReward(s); + + return d; + } + @Override public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]) { diff --git a/prism/src/explicit/MDPSparse.java b/prism/src/explicit/MDPSparse.java index ac71e797..450d7479 100644 --- a/prism/src/explicit/MDPSparse.java +++ b/prism/src/explicit/MDPSparse.java @@ -43,6 +43,7 @@ import java.util.TreeMap; import parser.State; import prism.PrismException; import prism.PrismUtils; +import explicit.rewards.MCRewards; import explicit.rewards.MDPRewards; /** @@ -754,6 +755,29 @@ public class MDPSparse extends MDPExplicit } } + @Override + public boolean prob1stepSingle(int s, int i, BitSet u, BitSet v) + { + int j, k, l2, h2; + boolean some, all; + + j = rowStarts[s] + i; + some = false; + all = true; + l2 = choiceStarts[j]; + h2 = choiceStarts[j + 1]; + for (k = l2; k < h2; k++) { + // Assume that only non-zero entries are stored + if (v.get(cols[k])) { + some = true; + } + if (!u.get(cols[k])) { + all = false; + } + } + return some && all; + } + @Override public double mvMultMinMaxSingle(int s, double vect[], boolean min, int strat[]) { @@ -825,12 +849,12 @@ public class MDPSparse extends MDPExplicit } @Override - public double mvMultSingle(int s, int k, double 
vect[]) + public double mvMultSingle(int s, int i, double vect[]) { - int j, l2, h2; + int j, k, l2, h2; double d; - j = rowStarts[s] + k; + j = rowStarts[s] + i; // Compute sum for this distribution d = 0.0; l2 = choiceStarts[j]; @@ -891,12 +915,12 @@ public class MDPSparse extends MDPExplicit } @Override - public double mvMultJacSingle(int s, int k, double vect[]) + public double mvMultJacSingle(int s, int i, double vect[]) { - int j, l2, h2; + int j, k, l2, h2; double diag, d; - j = rowStarts[s] + k; + j = rowStarts[s] + i; diag = 1.0; // Compute sum for this distribution d = 0.0; @@ -958,6 +982,26 @@ public class MDPSparse extends MDPExplicit return minmax; } + @Override + public double mvMultRewSingle(int s, int i, double[] vect, MCRewards mcRewards) + { + int j, k, l2, h2; + double d; + + j = rowStarts[s] + i; + // Compute sum for this distribution + // TODO: use transition rewards when added to DTMCss + // d = mcRewards.getTransitionReward(s); + d = 0; + l2 = choiceStarts[j]; + h2 = choiceStarts[j + 1]; + for (k = l2; k < h2; k++) { + d += nonZeros[k] * vect[cols[k]]; + } + d += mcRewards.getStateReward(s); + return d; + } + @Override public double mvMultRewJacMinMaxSingle(int s, double vect[], MDPRewards mdpRewards, boolean min, int strat[]) { diff --git a/prism/src/explicit/rewards/MCRewardsFromMDPRewards.java b/prism/src/explicit/rewards/MCRewardsFromMDPRewards.java new file mode 100644 index 00000000..0519e8e8 --- /dev/null +++ b/prism/src/explicit/rewards/MCRewardsFromMDPRewards.java @@ -0,0 +1,58 @@ +//============================================================================== +// +// Copyright (c) 2002- +// Authors: +// * Dave Parker (University of Birmingham/Oxford) +// +//------------------------------------------------------------------------------ +// +// This file is part of PRISM. +// +// PRISM is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// PRISM is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with PRISM; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//============================================================================== + +package explicit.rewards; + + +/** + * Explicit-state representation of a DTMC rewards structure, constructed (implicitly) + * from an MDP rewards structure and a memoryless deterministic strategy, specified as an array of integer indices. + * This class is read-only: most of data is pointers to other model info. + */ +public class MCRewardsFromMDPRewards implements MCRewards +{ + // MDP rewards + protected MDPRewards mdpRewards; + // Strategy (array of choice indices; -1 denotes no choice) + protected int strat[]; + + /** + * Constructor: create from MDP rewards and memoryless adversary. 
+ */ + public MCRewardsFromMDPRewards(MDPRewards mdpRewards, int strat[]) + { + this.mdpRewards = mdpRewards; + this.strat = strat; + } + + @Override + public double getStateReward(int s) + { + // For now, state/transition rewards from MDP are both put into state reward + // This works fine for cumulative rewards, but not instantaneous ones + return mdpRewards.getStateReward(s) + mdpRewards.getTransitionReward(s, strat[s]); + } +}
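
[Editor's note] MCRewardsFromMDPRewards collapses the MDP's state and transition rewards under a fixed memoryless strategy into a single DTMC state reward, r_MC(s) = r(s) + r(s, strat[s]), which is sufficient for cumulative (e.g. expected reachability) rewards but not for instantaneous ones, as the comment in getStateReward notes. The sketch below shows how the classes touched by this patch fit together when evaluating a single policy; the helper name evaluateStrategy is illustrative and not part of PRISM, but the call pattern mirrors computeReachRewardsPolIter above.

	// Illustrative helper: evaluate a fixed memoryless strategy on an MDP
	// by solving the induced DTMC with the induced DTMC rewards.
	static double[] evaluateStrategy(MDP mdp, MDPRewards mdpRewards, BitSet target, int[] strat,
			DTMCModelChecker mcDTMC) throws PrismException
	{
		DTMC dtmc = new DTMCFromMDPMemorylessAdversary(mdp, strat);           // induced DTMC
		MCRewards mcRewards = new MCRewardsFromMDPRewards(mdpRewards, strat); // r(s) + r(s, strat[s])
		ModelCheckerResult res = mcDTMC.computeReachRewards(dtmc, mcRewards, target, null, null);
		return res.soln;                                                      // expected reward under strat
	}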