Commit 8639f1a

added x_prob() and x_val() functions to access individual parts of the matrices.

We now use NA internally to represent * in the POMDP definition.

actions, states and observations are now factors in most places.
mhahsler committed May 16, 2022
1 parent a1513af commit 8639f1a
Showing 30 changed files with 587 additions and 233 deletions.
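The accessors named in the commit message, observation_prob(), transition_prob() and reward_val() (see the NAMESPACE diff below), return individual entries of the model matrices instead of whole matrices. A minimal usage sketch on the Tiger problem shipped with the package; the argument names (action, start.state, end.state, observation) are assumptions for illustration and are not confirmed by this commit:

library(pomdp)
data("Tiger")

## single transition probability P(end.state | start.state, action)
## (argument names are assumed for illustration)
transition_prob(Tiger, action = "listen",
  start.state = "tiger-left", end.state = "tiger-left")

## single observation probability P(observation | end.state, action)
observation_prob(Tiger, action = "listen",
  end.state = "tiger-left", observation = "tiger-left")

## single reward value for an action taken in a start state
reward_val(Tiger, action = "listen", start.state = "tiger-left")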
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -48,6 +48,7 @@ Collate:
 'reward.R'
 'round_stochchastic.R'
 'sample_belief_space.R'
+'simulate_MDP.R'
 'simulate_POMDP.R'
 'solve_MDP.R'
 'solve_POMDP.R'
7 changes: 4 additions & 3 deletions NAMESPACE
@@ -11,8 +11,8 @@ export(R_)
 export(T_)
 export(approx_MDP_policy_evaluation)
 export(estimate_belief_for_nodes)
-export(observation_function)
 export(observation_matrix)
+export(observation_prob)
 export(optimal_action)
 export(plot_belief_space)
 export(plot_policy_graph)
@@ -23,18 +23,19 @@ export(q_values_MDP)
 export(random_MDP_policy)
 export(read_POMDP)
 export(reward)
-export(reward_function)
 export(reward_matrix)
 export(reward_node_action)
+export(reward_val)
 export(round_stochastic)
 export(sample_belief_space)
+export(simulate_MDP)
 export(simulate_POMDP)
 export(solve_MDP)
 export(solve_POMDP)
 export(solve_POMDP_parameter)
 export(solve_SARSOP)
-export(transition_function)
 export(transition_matrix)
+export(transition_prob)
 export(update_belief)
 export(write_POMDP)
 import(graphics)
4 changes: 3 additions & 1 deletion NEWS.md
@@ -5,8 +5,10 @@
* reward() and reward_node_action() have now been separated.
* sample_belief_space() gained method 'trajectories'.
* simulate_POMDP(): now supports epsilon-greedy policies.
* observation_matrix() et al. functions are now created with a separate function ending in _function.
* added x_prob() and x_val() functions to access individual parts of the matrices.
* fixed converged finite-horizon case. It now only returns the converged graph/alpha.
* we now use NA internally to represent * in the POMDP definition.
* actions, states and observations are now factors in most places.

# pomdp 1.0.1 (03/25/2022)

60 changes: 55 additions & 5 deletions R/MDP.R
@@ -13,6 +13,9 @@
 #'
 #' More details on specifying the model components can be found in the documentation
 #' for [POMDP].
+#'
+#' @family MDP
+#'
 #' @include POMDP.R
 #' @param states a character vector specifying the names of the states.
 #' @param actions a character vector specifying the names of the available
@@ -103,7 +106,35 @@ MDP <- function(states,
 }
 
 #' @export
-print.MDP <- print.POMDP
+print.MDP <- function(x, ...) {
+  writeLines(paste(paste(class(x), collapse = ", "),
+    "-",
+    x$name))
+
+  if (!is.null(x$discount))
+    writeLines(sprintf(" Discount factor: %s",
+      paste(x$discount, collapse = "+")))
+
+  if (!is.null(x$horizon))
+    writeLines(sprintf(" Horizon: %s epochs",
+      paste(x$horizon, collapse = " + ")))
+
+  if (.solved_MDP(x))
+    writeLines(c(
+      " Solved:",
+      sprintf(" Solution converged: %s",
+        x$solution$converged)
+      )
+    )
+
+  writeLines(strwrap(
+    paste("List components:", paste(sQuote(names(
+      x
+    )), collapse = ", "), "\n"),
+    indent = 2,
+    exdent = 4
+  ))
+}
 
 #' @rdname MDP
 #' @export
@@ -117,14 +148,33 @@ MDP2POMDP <- function(x) {
   ident_matrix <- diag(length(x$states))
   dimnames(ident_matrix) <- list(x$states, x$observations)
 
-  x$observation_prob <- list('*' = ident_matrix)
+  x$observation_prob <- sapply(x$actions, FUN = function(x) ident_matrix, simplify = FALSE)
   class(x) <- c("MDP", "POMDP", "list")
   x
 }
 
-.solved_MDP <- function(x) {
+.solved_MDP <- function(x, stop = FALSE) {
   if (!inherits(x, "MDP"))
-    stop("x needs to be a POMDP object!")
-  if (is.null(x$solution))
     stop("x needs to be a MDP object!")
+  solved <- !is.null(x$solution)
+  if (stop && !solved)
+    stop("x needs to be a solved MDP. Use solve_MDP() first.")
+
+  solved
 }
+
+## this is .get_pg_index for MDPs
+.get_pol_index <- function(model, epoch) {
+
+  epoch <- as.integer(epoch)
+  if (epoch < 1L) stop("Epoch has to be >= 1")
+
+  ### (converged) infinite horizon POMDPs. We ignore epoch.
+  if (length(model$solution$policy) == 1L) return(1L)
+
+  ### regular epoch for finite/infinite horizon case
+  if (epoch > length(model$solution$policy))
+    stop("MDP model has only a policy up to epoch ", length(model$solution$policy))
+
+  return(epoch)
+}
2 changes: 1 addition & 1 deletion R/Maze.R
@@ -18,7 +18,7 @@
 #' The # (state `s_5`) in the middle of the maze is an obstruction and not reachable.
 #' Rewards are associated with transitions. The default reward (penalty) is -0.04.
 #' Transitioning to + (state `s_12`) gives a reward of 1.0, transitioning to - (state `s_11`)
-#' has a reward of -1.0. States `s_11` and `s_12` are terminal states.
+#' has a reward of -1.0. States `s_11` and `s_12` are terminal (absorbing) states.
 #'
 #' Actions are movements (`north`, `south`, `east`, `west`). The actions are unreliable with a .8 chance
 #' to move in the correct direction and a 0.1 chance to instead to move in a
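The R/Maze.R documentation above describes the classic 4x3 gridworld MDP. A minimal sketch of loading and solving it, assuming the Maze model ships as a data set and that solve_MDP() and policy() behave as in the released package (a sketch, not confirmed by this commit):

library(pomdp)
data("Maze")

## solve the maze MDP with the default solver settings and inspect the policy
sol <- solve_MDP(Maze)
policy(sol)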
