Fix off policy #174
Changes from 6 commits
@@ -23,7 +23,6 @@ class DBGFlowNet(PFBasedGFlowNet[Transitions]):
     Attributes:
         logF: a ScalarEstimator instance.
-        off_policy: If true, we need to reevaluate the log probs.
         forward_looking: whether to implement the forward looking GFN loss.
         log_reward_clip_min: If finite, clips log rewards to this value.
     """
@@ -33,16 +32,17 @@ def __init__(
         pf: GFNModule,
         pb: GFNModule,
         logF: ScalarEstimator,
-        off_policy: bool,
         forward_looking: bool = False,
         log_reward_clip_min: float = -float("inf"),
     ):
-        super().__init__(pf, pb, off_policy=off_policy)
+        super().__init__(pf, pb)
         self.logF = logF
         self.forward_looking = forward_looking
         self.log_reward_clip_min = log_reward_clip_min

-    def get_scores(self, env: Env, transitions: Transitions) -> Tuple[
+    def get_scores(
+        self, env: Env, transitions: Transitions, recalculate_all: bool = False
+    ) -> Tuple[
         TT["n_transitions", float],
         TT["n_transitions", float],
         TT["n_transitions", float],
@@ -52,6 +52,12 @@ def get_scores(self, env: Env, transitions: Transitions) -> Tuple[
         Args:
            transitions: a batch of transitions.

+        Unless recalculate_all=True, in which case we re-evaluate the logprobs of the transitions with
+        the current self.pf. The following applies:
+            - If transitions have log_probs attribute, use them - this is usually for on-policy learning
+            - Else, re-evaluate the log_probs using the current self.pf - this is usually for
+              off-policy learning with replay buffer
+
         Raises:
             ValueError: when supplied with backward transitions.
             AssertionError: when log rewards of transitions are None.
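For review context, a minimal usage sketch of the new flag (the names gflownet, env, and transitions are placeholders, not taken from this diff): by default, stored log_probs are reused when present; passing recalculate_all=True forces re-evaluation under the current self.pf, which is the intended mode when replaying transitions from a buffer.

# Hypothetical usage sketch; assumes a constructed DBGFlowNet instance
# `gflownet`, an environment `env`, and a Transitions batch `transitions`.

# On-policy: the transitions carry log_probs from sampling, so they are reused.
on_policy_scores = gflownet.get_scores(env, transitions)

# Replay buffer / off-policy: force re-evaluation of the log-probs under the
# current self.pf, even if stale log_probs are attached to the transitions.
replayed_scores = gflownet.get_scores(env, transitions, recalculate_all=True)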
@@ -66,19 +72,20 @@ def get_scores(self, env: Env, transitions: Transitions) -> Tuple[

         if states.batch_shape != tuple(actions.batch_shape):
             raise ValueError("Something wrong happening with log_pf evaluations")
-        if not self.off_policy:
+        if (
+            transitions.log_probs is not None
+            and transitions.log_probs.nelement() > 0
+            and not recalculate_all
+        ):

Review thread:
- I'm seeing this logic a few times in the code. Should we abstract it into a utility like … ?
- I've added this utility function.
- Great!
             valid_log_pf_actions = transitions.log_probs
         else:
-            # Evaluate the log PF of the actions sampled off policy.
-            # I suppose the Transitions container should then have some
-            # estimator_outputs attribute as well, to avoid duplication here ?
-            # See (#156).

Review thread:
- Why did you remove this issue reference (#156)?
- My bad! Added back.
-            module_output = self.pf(states)  # TODO: Inefficient duplication.
+            # Evaluate the log PF of the actions
+            module_output = self.pf(
+                states
+            )  # TODO: Inefficient duplication in case of tempered policy
             valid_log_pf_actions = self.pf.to_probability_distribution(
                 states, module_output
-            ).log_prob(
-                actions.tensor
-            )  # Actions sampled off policy.
+            ).log_prob(actions.tensor)

         valid_log_F_s = self.logF(states).squeeze(-1)
         if self.forward_looking:
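Regarding the utility suggested in the review thread above: the helper actually added in this PR is not shown in these hunks, so the following is only a hypothetical sketch of the idea. The name has_log_probs, its location, and its signature are assumptions, not part of the diff.

# Hypothetical helper; the real utility added in the PR may differ in name,
# location, and signature.
def has_log_probs(transitions) -> bool:
    """Return True if the container carries non-empty sampling log-probs."""
    return (
        transitions.log_probs is not None
        and transitions.log_probs.nelement() > 0
    )

# The branch repeated in both get_scores methods could then read, e.g.:
#     if has_log_probs(transitions) and not recalculate_all:
#         valid_log_pf_actions = transitions.log_probs
#     else:
#         ...  # re-evaluate under the current self.pf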
@@ -147,9 +154,17 @@ class ModifiedDBGFlowNet(PFBasedGFlowNet[Transitions]): | |
https://arxiv.org/abs/2202.13903 for more details. | ||
""" | ||
|
||
def get_scores(self, transitions: Transitions) -> TT["n_trajectories", torch.float]: | ||
def get_scores( | ||
self, transitions: Transitions, recalculate_all: bool = False | ||
) -> TT["n_trajectories", torch.float]: | ||
"""DAG-GFN-style detailed balance, when all states are connected to the sink. | ||
|
||
Unless recalculate_all=True, in which case we re-evaluate the logprobs of the transitions with | ||
the current self.pf. The following applies: | ||
- If transitions have log_probs attribute, use them - this is usually for on-policy learning | ||
- Else, re-evaluate the log_probs using the current self.pf - this is usually for | ||
off-policy learning with replay buffer | ||
|
||
Raises: | ||
ValueError: when backward transitions are supplied (not supported). | ||
ValueError: when the computed scores contain `inf`. | ||
|
@@ -164,7 +179,11 @@ def get_scores(self, transitions: Transitions) -> TT["n_trajectories", torch.flo
         all_log_rewards = transitions.all_log_rewards[mask]
         module_output = self.pf(states)
         pf_dist = self.pf.to_probability_distribution(states, module_output)
-        if not self.off_policy:
+        if (
+            transitions.log_probs is not None
+            and transitions.log_probs.nelement() > 0
+            and not recalculate_all
+        ):
             valid_log_pf_actions = transitions[mask].log_probs
         else:
             # Evaluate the log PF of the actions sampled off policy.
Review thread:
- What's going on here? I find this confusing.
- I'm adding it back in. I'm sure this works and is potentially correct, but I find it weird, and I suspect others will as well.
- Not sure what happened. Actually, we don't need that line altogether (thanks, Pylance)! I'm removing the whole line.
- OK, that works for me ;)