new code for smaller dataset & multiple bernoullis
davidzhu27 committed Mar 31, 2024
1 parent a1afd8d commit 09665cf
Showing 5 changed files with 71 additions and 12 deletions.
17 changes: 10 additions & 7 deletions algorithms/offline/cql.py
@@ -19,7 +19,8 @@
from torch.distributions import Normal, TanhTransform, TransformedDistribution

from pbrl import scale_rewards, generate_pbrl_dataset, make_latent_reward_dataset, train_latent, predict_and_label_latent_reward
- from pbrl import label_by_trajectory_reward, generate_pbrl_dataset_no_overlap
+ from pbrl import label_by_trajectory_reward, generate_pbrl_dataset_no_overlap, small_d4rl_dataset
+ from pbrl import label_by_trajectory_reward_multiple_bernoullis

TensorBatch = List[torch.Tensor]

@@ -28,8 +29,8 @@
class TrainConfig:
# Experiment
device: str = "cuda"
env: str = "hopper-medium-expert-v2" # OpenAI gym environment name
seed: int = 2 # Sets Gym, PyTorch and Numpy seeds
env: str = "halfcheetah-expert-v2" # OpenAI gym environment name
seed: int = 6 # Sets Gym, PyTorch and Numpy seeds
eval_freq: int = int(5e3) # How often (time steps) we evaluate
n_episodes: int = 10 # How many episodes run during evaluation
max_timesteps: int = int(1e6) # Max time steps to run environment
@@ -75,7 +76,7 @@ class TrainConfig:
name: str = "CQL"

def __post_init__(self):
- self.name = f"{self.name}-{self.env}-{'original'}"
+ self.name = f"{self.name}-{self.env}-{'original_smaller'}"
if self.checkpoints_path is not None:
self.checkpoints_path = os.path.join(self.checkpoints_path, self.name)

@@ -169,8 +170,8 @@ def sample(self, batch_size: int) -> TensorBatch:
# len_t = 20
# indices_of_traj = np.random.randint(0, num_t*2, size=batch_size//len_t)
# indices = np.array([np.arange(i*len_t, (i+1)*len_t) for i in indices_of_traj]).flatten()

indices = np.random.randint(0, min(self._size, self._pointer), size=batch_size)

states = self._states[indices]
actions = self._actions[indices]
rewards = self._rewards[indices]
@@ -880,14 +881,16 @@ def train(config: TrainConfig):

# print(len(dataset['rewards']))
# print(sum(dataset['terminals']))
+ dataset = small_d4rl_dataset(dataset=dataset, n_states=200000)
# dataset = scale_rewards(dataset)
# num_t = 5000
# len_t = 20
# pbrl_dataset = generate_pbrl_dataset_no_overlap(dataset, pbrl_dataset_file_path=f'CORL/saved/pbrl_datasets/pbrl_dataset_no_overlap_{config.env}_{num_t}_{len_t}.npz', num_t=num_t, len_t=len_t)
# latent_reward_model, indices = train_latent(dataset, pbrl_dataset, num_t=num_t, len_t=len_t)
# latent_reward_model, indices = train_latent(dataset, pbrl_dataset, cross_entropy=True, num_t=num_t, len_t=len_t)
# dataset = predict_and_label_latent_reward(dataset, latent_reward_model, indices)
# dataset = label_by_trajectory_reward(dataset, pbrl_dataset, num_t=num_t)

+ # dataset = label_by_trajectory_reward_multiple_bernoullis(dataset, pbrl_dataset, num_t=num_t)

replay_buffer.load_d4rl_dataset(dataset)

max_action = float(env.action_space.high[0])
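As an aside, the net effect of the new small_d4rl_dataset(dataset=dataset, n_states=200000) call in train() is to keep only the first n_states transitions of the D4RL dataset dict before it is loaded into the replay buffer. Below is a minimal sketch of that flow; the helper mirrors the definition added to pbrl.py further down, and the toy dict standing in for a real D4RL dataset (its sizes and dimensions) is made up for illustration.

```python
import numpy as np

def small_d4rl_dataset(dataset, n_states):
    # Same behavior as the helper added in pbrl.py below:
    # keep only the first n_states rows of each array in the dict.
    smaller = dataset.copy()
    for key in ("observations", "actions", "next_observations", "rewards", "terminals"):
        smaller[key] = smaller[key][:n_states]
    return smaller

# Toy stand-in for d4rl.qlearning_dataset(env); sizes and dims are arbitrary.
rng = np.random.default_rng(0)
dataset = {
    "observations":      rng.normal(size=(1000, 17)).astype(np.float32),
    "actions":           rng.normal(size=(1000, 6)).astype(np.float32),
    "next_observations": rng.normal(size=(1000, 17)).astype(np.float32),
    "rewards":           rng.normal(size=1000).astype(np.float32),
    "terminals":         np.zeros(1000, dtype=bool),
}
dataset = small_d4rl_dataset(dataset, n_states=200)
print(dataset["observations"].shape)  # (200, 17)
# replay_buffer.load_d4rl_dataset(dataset) then only ever samples these 200 transitions.
```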
66 changes: 61 additions & 5 deletions algorithms/offline/pbrl.py
@@ -51,6 +51,7 @@ def label_by_trajectory_reward(dataset, pbrl_dataset, num_t, len_t=20):
# double checking
t1s, t2s, ps = pbrl_dataset
sampled = np.random.randint(low=0, high=num_t, size=(num_t,))
+ # print(np.max(t1s))
t1s_indices = t1s[sampled].flatten()
t2s_indices = t2s[sampled].flatten()
# t1s_indices = t1s.flatten()
@@ -102,8 +103,9 @@ def forward(self, x):
latent_reward_X : (2 * N * num_t * len_t , 23)
mus : (2 * N * num_t * len_t, 1)
"""
- def make_latent_reward_dataset(dataset, pbrl_dataset, num_t, len_t=20):
+ def make_latent_reward_dataset(dataset, pbrl_dataset, num_t, len_t=20, num_trials=1):
t1s, t2s, ps = pbrl_dataset
+ # print(t1s.shape, np.max(t1s))
indices = torch.randint(high=num_t, size=(num_t,))
t1s_sample = t1s[indices]
t2s_sample = t2s[indices]
Expand All @@ -116,13 +118,18 @@ def make_latent_reward_dataset(dataset, pbrl_dataset, num_t, len_t=20):
act_values = acts[indices]
latent_reward_X = np.concatenate((obs_values, act_values), axis=1)

- mus = torch.bernoulli(torch.from_numpy(ps_sample)).long()
- return torch.tensor(latent_reward_X), mus, indices
+ mus = torch.zeros_like(torch.from_numpy(ps_sample))
+ for _ in range(num_trials):
+     mus += torch.bernoulli(torch.from_numpy(ps_sample)).long()
+ mus /= num_trials
+ return torch.tensor(latent_reward_X), mus.to(torch.long), indices


- def train_latent(dataset, pbrl_dataset, num_t, len_t,
+ def train_latent(dataset, pbrl_dataset, cross_entropy, num_t, len_t,
n_epochs = 1000, patience=5, model_file_path=""):
X, mus, indices = make_latent_reward_dataset(dataset, pbrl_dataset, num_t=num_t, len_t=len_t)
+ if not cross_entropy:
+     X, mus, indices = make_latent_reward_dataset(dataset, pbrl_dataset, num_t=num_t, len_t=len_t, num_trials=10)
dim = dataset['observations'].shape[1] + dataset['actions'].shape[1]
# if os.path.exists(model_file_path):
# print(f'model successfully loaded from {model_file_path}')
@@ -133,6 +140,10 @@ def train_latent(dataset, pbrl_dataset, num_t, len_t,
assert((num_t * 2 * len_t, dim) == X.shape)
model = LatentRewardModel(input_dim=dim)
criterion = nn.CrossEntropyLoss()
+ if not cross_entropy:
+     def bernoulli_kl(p_pred, p_true):
+         return torch.sum(p_pred * torch.log(p_pred / p_true) + (1 - p_pred) * torch.log((1-p_pred)/(1-p_true)))
+     criterion = bernoulli_kl
optimizer = optim.Adam(model.parameters(), lr=0.001)
best_loss = float('inf')
current_patience = 0
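For a quick standalone sanity check of the new criterion: the expression above is the element-wise KL divergence between Bernoulli(p_pred) and Bernoulli(p_true), summed over the batch, so it can be compared against torch.distributions.kl_divergence. The probabilities below are made up and kept strictly inside (0, 1) to avoid log(0).

```python
import torch
from torch.distributions import Bernoulli, kl_divergence

def bernoulli_kl(p_pred, p_true):
    # Sum over elements of KL(Bernoulli(p_pred) || Bernoulli(p_true)), matching the diff above.
    return torch.sum(p_pred * torch.log(p_pred / p_true)
                     + (1 - p_pred) * torch.log((1 - p_pred) / (1 - p_true)))

p_pred = torch.tensor([0.2, 0.6, 0.9])
p_true = torch.tensor([0.1, 0.5, 0.8])
print(bernoulli_kl(p_pred, p_true))
print(kl_divergence(Bernoulli(probs=p_pred), Bernoulli(probs=p_true)).sum())  # should agree
```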
@@ -223,13 +234,15 @@ def generate_pbrl_dataset_no_overlap(dataset, num_t, len_t, pbrl_dataset_file_pa
if pbrl_dataset_file_path != "" and os.path.exists(pbrl_dataset_file_path):
pbrl_dataset = np.load(pbrl_dataset_file_path)
print(f"pbrl_dataset loaded successfully from {pbrl_dataset_file_path}")
+ # print('max t1', np.max(pbrl_dataset['t1s']))
return (pbrl_dataset['t1s'], pbrl_dataset['t2s'], pbrl_dataset['ps'])
else:
# assuming no terminal states
t1s = np.zeros((num_t, len_t), dtype=int)
t2s = np.zeros((num_t, len_t), dtype=int)
ps = np.zeros(num_t)
- starting_indices = list(range(0, len(dataset['observations']), len_t))
+ starting_indices = list(range(0, len(dataset['observations'])-len_t+1, len_t))
+ # print(len(starting_indices))
for i in range(num_t):
t1, r1 = pick_and_calc_reward(dataset, starting_indices, len_t)
t2, r2 = pick_and_calc_reward(dataset, starting_indices, len_t)
@@ -242,8 +255,51 @@ def generate_pbrl_dataset_no_overlap(dataset, num_t, len_t, pbrl_dataset_file_pa
return (t1s, t2s, ps)

def pick_and_calc_reward(dataset, starting_indices, len_t):
# print(len(starting_indices))
n0 = random.choice(starting_indices)
starting_indices.remove(n0)
ns = np.array(np.arange(n0, n0+len_t))
r = np.sum(dataset['rewards'][n0:n0+len_t])
return ns, r

def small_d4rl_dataset(dataset, n_states):
smaller = dataset.copy()
smaller['observations'] = smaller['observations'][:n_states]
smaller['actions'] = smaller['actions'][:n_states]
smaller['next_observations'] = smaller['next_observations'][:n_states]
smaller['rewards'] = smaller['rewards'][:n_states]
smaller['terminals'] = smaller['terminals'][:n_states]
return smaller


def label_by_trajectory_reward_multiple_bernoullis(dataset, pbrl_dataset, num_t, len_t=20):
# double checking
t1s, t2s, ps = pbrl_dataset
sampled = np.random.randint(low=0, high=num_t, size=(num_t,))
t1s_indices = t1s[sampled].flatten()
t2s_indices = t2s[sampled].flatten()
# t1s_indices = t1s.flatten()
# t2s_indices = t2s.flatten()
ps_sample = ps[sampled]
mus = multiple_bernoulli_trials_one_neg_one(ps_sample, num_trials=10)
repeated_mus = np.repeat(mus, len_t)

sampled_dataset = dataset.copy()
sampled_dataset['rewards'] = np.array(sampled_dataset['rewards'])
sampled_dataset['rewards'][t1s_indices] = repeated_mus
sampled_dataset['rewards'][t2s_indices] = -1 * repeated_mus

all_indices = np.concatenate([t1s_indices, t2s_indices])
sampled_dataset['observations'] = sampled_dataset['observations'][all_indices]
sampled_dataset['actions'] = sampled_dataset['actions'][all_indices]
sampled_dataset['next_observations'] = sampled_dataset['next_observations'][all_indices]
sampled_dataset['rewards'] = sampled_dataset['rewards'][all_indices]
sampled_dataset['terminals'] = sampled_dataset['terminals'][all_indices]

return sampled_dataset

def multiple_bernoulli_trials_one_neg_one(p, num_trials):
mus = 0
for _ in range(num_trials):
mus += torch.bernoulli(torch.from_numpy(p)).numpy()
return -1 + 2 * (mus / num_trials)
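To make the scale of these labels concrete: averaging num_trials Bernoulli(p) draws estimates p, and -1 + 2 * (mus / num_trials) maps that estimate from [0, 1] onto [-1, 1], so with num_trials=10 every label lies on the grid {-1.0, -0.8, ..., 1.0} and equals -1 + 2p in expectation. A small standalone check (the helper is copied from the diff; the probabilities are made up):

```python
import numpy as np
import torch

def multiple_bernoulli_trials_one_neg_one(p, num_trials):
    # Average num_trials Bernoulli(p) draws, then map the mean from [0, 1] to [-1, 1].
    mus = 0
    for _ in range(num_trials):
        mus += torch.bernoulli(torch.from_numpy(p)).numpy()
    return -1 + 2 * (mus / num_trials)

torch.manual_seed(0)
p = np.array([0.1, 0.5, 0.8])
print(multiple_bernoulli_trials_one_neg_one(p, num_trials=10))
# a single call gives values on the grid {-1.0, -0.8, ..., 1.0}

avg = np.mean([multiple_bernoulli_trials_one_neg_one(p, 10) for _ in range(2000)], axis=0)
print(avg)  # close to -1 + 2*p = [-0.8, 0.0, 0.6]
```

Averaging several draws gives a lower-variance estimate of the preference probability than a single Bernoulli sample, at the cost of labels that are no longer strictly ±1.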
3 binary files changed (contents not shown).
