From 6073712dc9137acdf2babaf3d2c2c36e8081aeaf Mon Sep 17 00:00:00 2001
From: Dave Parker
Date: Sun, 15 Dec 2013 02:09:28 +0000
Subject: [PATCH] Bugfix in policy iteration (bug found by Aaron Bohy): needs
 to be passed the strategy from Prob1.

git-svn-id: https://www.prismmodelchecker.org/svn/prism/prism/trunk@7707 bbc10eb1-c90d-0410-af57-cb519fbb1720
---
 prism/src/explicit/MDPModelChecker.java | 36 ++++++-------------------
 1 file changed, 8 insertions(+), 28 deletions(-)

diff --git a/prism/src/explicit/MDPModelChecker.java b/prism/src/explicit/MDPModelChecker.java
index 68e39d9e..dcdc9c3a 100644
--- a/prism/src/explicit/MDPModelChecker.java
+++ b/prism/src/explicit/MDPModelChecker.java
@@ -1423,7 +1423,7 @@ public class MDPModelChecker extends ProbModelChecker
 		// If required, create/initialise strategy storage
 		// Set choices to -1, denoting unknown
 		// (except for target states, which are -2, denoting arbitrary)
-		if (genStrat || exportAdv) {
+		if (genStrat || exportAdv || mdpSolnMethod == MDPSolnMethod.POLICY_ITERATION) {
 			strat = new int[n];
 			for (i = 0; i < n; i++) {
 				strat[i] = target.get(i) ? -2 : -1;
@@ -1432,17 +1432,17 @@ public class MDPModelChecker extends ProbModelChecker
 
 		// Precomputation (not optional)
 		timerProb1 = System.currentTimeMillis();
-		inf = prob1(mdp, null, target, !min, null);
+		inf = prob1(mdp, null, target, !min, strat);
 		inf.flip(0, n);
 		timerProb1 = System.currentTimeMillis() - timerProb1;
-		
+
 		// Print results of precomputation
 		numTarget = target.cardinality();
 		numInf = inf.cardinality();
 		mainLog.println("target=" + numTarget + ", inf=" + numInf + ", rest=" + (n - (numTarget + numInf)));
 
 		// If required, generate strategy for "inf" states.
-		if (genStrat || exportAdv) {
+		if (genStrat || exportAdv || mdpSolnMethod == MDPSolnMethod.POLICY_ITERATION) {
 			if (min) {
 				// If min reward is infinite, all choices give infinity
 				// So the choice can be arbitrary, denoted by -2;
@@ -1692,7 +1692,10 @@ public class MDPModelChecker extends ProbModelChecker
 
 	/**
 	 * Compute expected reachability rewards using policy iteration.
-	 * Optionally, store optimal (memoryless) strategy info.
+	 * The array {@code strat} is used both to pass in the initial strategy for policy iteration,
+	 * and as storage for the resulting optimal strategy (if needed).
+	 * Passing in an initial strategy is required when some states have infinite reward,
+	 * to avoid the possibility of policy iteration getting stuck on an infinite-value strategy.
 	 * @param mdp The MDP
 	 * @param mdpRewards The rewards
 	 * @param target Target states
@@ -1742,29 +1745,6 @@ public class MDPModelChecker extends ProbModelChecker
 			for (i = 0; i < n; i++)
 				strat[i] = 0;
 		}
-		// Otherwise, just initialise for states not in target/inf
-		// (Optimal choices for target/inf should already be known)
-		else {
-			for (i = 0; i < n; i++)
-				if (!(target.get(i) || inf.get(i)))
-					strat[i] = 0;
-		}
-		// For minimum rewards, we need to make sure that initial strategy choices
-		// do not result in infinite rewards for any states that are know not to be infinite
-		// (otherwise policy iteration may not converge)
-		if (min) {
-			for (i = 0; i < n; i++) {
-				if (!(target.get(i) || inf.get(i))) {
-					int numChoices = mdp.getNumChoices(i);
-					for (int k = 0; k < numChoices; k++) {
-						if (!mdp.someSuccessorsInSet(i, k, inf)) {
-							strat[i] = k;
-							continue;
-						}
-					}
-				}
-			}
-		}
 
 		// Start iterations
 		iters = totalIters = 0;
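
Note on the fix: the sketch below is illustrative Java only, not PRISM code; the class, fields and method names are invented for the example. It shows why policy iteration for minimum expected reachability rewards must be seeded with a proper strategy, i.e. one reaching the target with probability 1, which is what the Prob1 precomputation now supplies through the strat array. If the initial strategy contains a zero-reward loop, policy evaluation assigns it a finite value (0) even though its true expected reachability reward is infinite, and strict policy improvement then never leaves it.

    // Minimal, self-contained sketch (assumed names, not PRISM's API).
    public class PolicyIterationSketch {

        // Tiny MDP: state 1 is the target. State 0 has two choices:
        //   choice 0: self-loop, reward 0 (improper: never reaches the target,
        //             so its true expected reachability reward is infinite)
        //   choice 1: move to the target, reward 5
        static final int N = 2, TARGET = 1;
        static final int[][] succ = { { 0, 1 }, { 1 } };
        static final double[][] rew = { { 0.0, 5.0 }, { 0.0 } };

        // Policy evaluation by (truncated) value iteration. For the improper
        // zero-reward self-loop this "converges" to 0, silently hiding the
        // fact that the true value is infinite.
        static double[] evaluate(int[] strat) {
            double[] v = new double[N];
            for (int iter = 0; iter < 1000; iter++) {
                for (int s = 0; s < N; s++) {
                    if (s == TARGET)
                        continue;
                    int c = strat[s];
                    v[s] = rew[s][c] + v[succ[s][c]];
                }
            }
            return v;
        }

        // One step of (min) policy improvement: switch only on strict improvement.
        static void improve(int[] strat, double[] v) {
            for (int s = 0; s < N; s++) {
                if (s == TARGET)
                    continue;
                for (int c = 0; c < succ[s].length; c++) {
                    if (rew[s][c] + v[succ[s][c]] < rew[s][strat[s]] + v[succ[s][strat[s]]] - 1e-12)
                        strat[s] = c;
                }
            }
        }

        public static void main(String[] args) {
            // Seeded with the improper self-loop: evaluation gives v[0] = 0,
            // improvement finds no strictly better choice, and policy iteration
            // terminates with the wrong answer 0 (true value: 5).
            int[] bad = { 0, 0 };
            double[] v = evaluate(bad);
            improve(bad, v);
            System.out.println("improper seed: v[0] = " + v[0] + ", choice = " + bad[0]);

            // Seeded with a proper strategy (as a Prob1-style precomputation
            // would supply): evaluation gives the correct value 5, and the
            // self-loop never looks strictly better, so the result stays correct.
            int[] good = { 1, 0 };
            v = evaluate(good);
            improve(good, v);
            System.out.println("proper seed:   v[0] = " + v[0] + ", choice = " + good[0]);
        }
    }

Running this prints v[0] = 0 for the improper seed (policy iteration stuck on an infinite-value strategy) and v[0] = 5.0 for the proper seed; the ad-hoc initialisation removed by this patch attempted to address the same problem, and seeding from Prob1 replaces it.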