diff --git a/readme.md b/README.md
similarity index 77%
rename from readme.md
rename to README.md
index 7010f727..01b1c053 100644
--- a/readme.md
+++ b/README.md
@@ -4,18 +4,27 @@
Apex is a small, modular library that contains some implementations of continuous reinforcement learning algorithms. Fully compatible with OpenAI gym.
+
+
+
## Running experiments
### Basics
Any algorithm can be run from the apex.py entry point.
-To run DDPG on Walker2d-v2,
+To run PPO on a Cassie environment,
+
+```bash
+python apex.py ppo --env_name Cassie-v0 --num_procs 12 --run_name experiment01
+```
+
+To run TD3 on the gym environment Walker2d-v2,
```bash
-python apex.py ddpg --env_name Walker2d-v2 --batch_size 64
+python apex.py td3_async --env_name Walker2d-v2 --num_procs 12 --run_name experiment02
```
-### Logging details / Monitoring live training progress
+## Logging details / Monitoring live training progress
Tensorboard logging is enabled by default for all algorithms. The logger expects that you supply an argument named ```logdir```, containing the root directory you want to store your logfiles in, and an argument named ```seed```, which is used to seed the pseudorandom number generators.
A basic command line script illustrating this is:
@@ -27,9 +36,9 @@ python apex.py ars --logdir logs/ars --seed 1337
The resulting directory tree would look something like this:
```
trained_models/ # directory with all of the saved models and tensorboard logs
-└── td3_async # algorithm name
+└── ars # algorithm name
└── Cassie-v0 # environment name
- └── 8b8b12-seed1 # unique run name created with hash of hyperparameters + seed
+    └── 8b8b12-seed1             # unique run name: hash of hyperparameters + seed
├── actor.pt # actor network for algo
├── critic.pt # critic network for algo
├── events.out.tfevents # tensorboard binary file
@@ -41,17 +50,16 @@ Using tensorboard makes it easy to compare experiments and resume training later
To see live training progress
-Run ```$ tensorboard --logdir logs/ --port=8097``` then navigate to ```http://localhost:8097/``` in your browser
-
-## Unit tests
-You can run the unit tests using pytest.
+Run ```$ tensorboard --logdir logs/```, then navigate to ```http://localhost:6006/``` in your browser.
-### To Do
-- [ ] Sphinx documentation and github wiki
-- [ ] Support loading pickle of hyperparameters for resuming training
-- [ ] Improve/Tune implementations of TD3, add capability to resume training for off policy algorithms
+## Cassie Environments:
+* `Cassie-v0` : basic unified environment for walking/running policies
+* `CassieTraj-v0` : unified environment with reference trajectories
+* `CassiePlayground-v0` : environment for executing autonomous missions
+* `CassieStanding-v0` : environment for training standing policies
-## Features:
+## Algorithms:
+### Currently implemented:
* Parallelism with [Ray](https://github.com/ray-project/ray)
* [GAE](https://arxiv.org/abs/1506.02438)/TD(lambda) estimators
* [PPO](https://arxiv.org/abs/1707.06347), VPG with ratio objective and with log likelihood objective
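
The GAE/TD(lambda) estimators listed above follow the standard advantage recurrence; a minimal single-trajectory sketch (`gae_advantages` and its arguments are illustrative, not the repository's estimator, which is not shown in this diff):

```python
import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """GAE over a single trajectory; `values` carries one extra entry for the
    bootstrap value of the final state."""
    T = len(rewards)
    advantages = np.zeros(T)
    running = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # TD residual
        running = delta + gamma * lam * running                 # discounted sum of residuals
        advantages[t] = running
    return advantages

print(gae_advantages(rewards=np.ones(5), values=np.zeros(6)))
```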
diff --git a/apex.py b/apex.py
index 86b5adbb..9654a5e9 100644
--- a/apex.py
+++ b/apex.py
@@ -1,6 +1,8 @@
import torch
import sys, pickle, argparse
-from util import color, print_logo, env_factory, create_logger, EvalProcessClass, parse_previous, collect_data
+from util.logo import print_logo
+from util.log import parse_previous
+from util.eval import EvalProcessClass
if __name__ == "__main__":
@@ -10,31 +12,31 @@
"""
General arguments for configuring the environment
"""
+ # command input, state input, env attributes
+ parser.add_argument("--command_profile", default="clock", type=str.lower, choices=["clock", "phase", "traj"])
+ parser.add_argument("--input_profile", default="full", type=str.lower, choices=["full", "min"])
parser.add_argument("--simrate", default=50, type=int, help="simrate of environment")
- parser.add_argument("--traj", default="walking", type=str, help="reference trajectory to use. options are 'aslip', 'walking', 'stepping'")
- parser.add_argument("--phase_based", default=False, action='store_true', dest='phase_based')
- parser.add_argument("--not_clock_based", default=True, action='store_false', dest='clock_based')
- parser.add_argument("--not_state_est", default=True, action='store_false', dest='state_est')
parser.add_argument("--not_dyn_random", default=True, action='store_false', dest='dyn_random')
- parser.add_argument("--not_no_delta", default=True, action='store_false', dest='no_delta')
- parser.add_argument("--not_mirror", default=True, action='store_false', dest='mirror') # mirror actions or not
parser.add_argument("--learn_gains", default=False, action='store_true', dest='learn_gains') # learn PD gains or not
+ # attributes for trajectory based environments
+ parser.add_argument("--traj", default="walking", type=str, help="reference trajectory to use. options are 'aslip', 'walking', 'stepping'")
+ parser.add_argument("--not_no_delta", default=True, action='store_false', dest='no_delta')
parser.add_argument("--ik_baseline", default=False, action='store_true', dest='ik_baseline') # use ik as baseline for aslip + delta policies?
+ # mirror loss and reward
+ parser.add_argument("--not_mirror", default=True, action='store_false', dest='mirror') # mirror actions or not
parser.add_argument("--reward", default=None, type=str) # reward to use. this is a required argument
- # parser.add_argument("--gainsDivide", default=1.0, type=float)
"""
General arguments for configuring the logger
"""
+ parser.add_argument("--env_name", default="Cassie-v0") # environment name
parser.add_argument("--run_name", default=None) # run name
-
"""
- Arguments generally used for Curriculum Learning
+ General arguments for Curriculum Learning
"""
parser.add_argument("--exchange_reward", default=None) # Can only be used with previous (below)
parser.add_argument("--previous", type=str, default=None) # path to directory of previous policies for resuming training
- parser.add_argument("--fixed_speed", type=float, default=None) # Fixed speed to train/test at
if len(sys.argv) < 2:
print("Usage: python apex.py [option]", sys.argv)
@@ -59,7 +61,6 @@
parser.add_argument("--recurrent", "-r", action='store_true') # whether to use a recurrent policy
parser.add_argument("--logdir", default="./trained_models/ars/", type=str)
parser.add_argument("--seed", "-s", default=0, type=int)
- parser.add_argument("--env_name", "-e", default="Hopper-v3")
parser.add_argument("--average_every", default=10, type=int)
parser.add_argument("--save_model", "-m", default=None, type=str) # where to save the trained model to
parser.add_argument("--redis", default=None)
@@ -105,7 +106,6 @@
else:
parser.add_argument("--logdir", default="./trained_models/ddpg/", type=str)
parser.add_argument("--seed", "-s", default=0, type=int)
- parser.add_argument("--env_name", "-e", default="Hopper-v3")
args = parser.parse_args()
@@ -126,7 +126,6 @@
# general args
parser.add_argument("--logdir", default="./trained_models/syncTD3/", type=str)
parser.add_argument("--previous", type=str, default=None) # path to directory of previous policies for resuming training
- parser.add_argument("--env_name", default="Cassie-v0") # environment name
parser.add_argument("--history", default=0, type=int) # number of previous states to use as input
parser.add_argument("--redis_address", type=str, default=None) # address of redis server (for cluster setups)
parser.add_argument("--seed", default=0, type=int) # Sets Gym, PyTorch and Numpy seeds
@@ -168,7 +167,6 @@
from rl.algos.async_td3 import run_experiment
# args common for actors and learners
- parser.add_argument("--env_name", default="Cassie-v0") # environment name
parser.add_argument("--hidden_size", default=256) # neurons in hidden layer
parser.add_argument("--history", default=0, type=int) # number of previous states to use as input
@@ -223,8 +221,6 @@
from rl.algos.ppo import run_experiment
# general args
- parser.add_argument("--algo_name", default="ppo") # algo name
- parser.add_argument("--env_name", "-e", default="Cassie-v0")
parser.add_argument("--logdir", type=str, default="./trained_models/ppo/") # Where to log diagnostics to
parser.add_argument("--seed", default=0, type=int) # Sets Gym, PyTorch and Numpy seeds
parser.add_argument("--history", default=0, type=int) # number of previous states to use as input
@@ -252,13 +248,8 @@
parser.add_argument("--max_traj_len", type=int, default=400, help="Max episode horizon")
parser.add_argument("--recurrent", action='store_true')
parser.add_argument("--bounded", type=bool, default=False)
-
args = parser.parse_args()
- # Argument setup checks. Ideally all arg settings are compatible with each other, but that's not convenient for fast development
- if (args.ik_baseline and not args.traj == "aslip") or (args.learn_gains and args.mirror):
- raise Exception("Incompatible environment config settings")
-
args = parse_previous(args)
run_experiment(args)
@@ -267,8 +258,7 @@
sys.argv.remove(sys.argv[1])
- parser.add_argument("--path", type=str, default="./trained_models/ppo/Cassie-v0/7b7e24-seed0/", help="path to folder containing policy and run details")
- parser.add_argument("--env_name", default="Cassie-v0", type=str)
+ parser.add_argument("--path", type=str, default="./trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/", help="path to folder containing policy and run details")
parser.add_argument("--traj_len", default=400, type=str)
parser.add_argument("--history", default=0, type=int) # number of previous states to use as input
parser.add_argument("--mission", default="default", type=str) # only used by playground environment
@@ -276,25 +266,15 @@
parser.add_argument("--debug", default=False, action='store_true')
parser.add_argument("--no_stats", dest="stats", default=True, action='store_false')
parser.add_argument("--no_viz", default=False, action='store_true')
- parser.add_argument("--collect_data", default=False, action='store_true')
args = parser.parse_args()
run_args = pickle.load(open(args.path + "experiment.pkl", "rb"))
- if not hasattr(run_args, "simrate"):
- run_args.simrate = 50
- print("manually choosing simrate as 50 (40 Hz)")
- if not hasattr(run_args, "phase_based"):
- run_args.phase_based = False
-
policy = torch.load(args.path + "actor.pt")
policy.eval()
- if args.collect_data:
- collect_data(policy, args, run_args)
- else:
- # eval_policy(policy, args, run_args)
- # eval_policy_input_viz(policy, args, run_args)
- ev = EvalProcessClass(args, run_args)
- ev.eval_policy(policy, args, run_args)
+ # eval_policy(policy, args, run_args)
+ # eval_policy_input_viz(policy, args, run_args)
+ ev = EvalProcessClass(args, run_args)
+ ev.eval_policy(policy, args, run_args)
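
For reference, the evaluation path above condenses to roughly the following sketch. The run directory and the `Namespace` field values are placeholders that mirror the eval parser in this file:

```python
import pickle
from argparse import Namespace

import torch

from util.eval import EvalProcessClass

path = "./trained_models/ppo/Cassie-v0/example-run/"          # placeholder run directory
args = Namespace(path=path, traj_len=400, history=0, mission="default",
                 debug=False, stats=True, no_viz=False)

run_args = pickle.load(open(path + "experiment.pkl", "rb"))   # hyperparameters saved at training time
policy = torch.load(path + "actor.pt")
policy.eval()

EvalProcessClass(args, run_args).eval_policy(policy, args, run_args)
```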
diff --git a/cassie/__init__.py b/cassie/__init__.py
index bee375af..a63ec94f 100644
--- a/cassie/__init__.py
+++ b/cassie/__init__.py
@@ -1,8 +1,10 @@
+# Unified
+from .cassie import CassieEnv
+from .cassie_traj import CassieTrajEnv
from .cassie_playground import CassiePlayground
-from .cassie import CassieEnv_v2 as CassieEnv
-from .cassie_min import CassieEnv_v3 as CassieMinEnv
-from .cassie_standing_env import CassieStandingEnv
-from .cassie import CassieEnv_v2
+from .cassie_standing_env import CassieStandingEnv # sorta old/unused
+
+# Proprietary
from .cassie_noaccel_footdist_omniscient import CassieEnv_noaccel_footdist_omniscient
from .cassie_footdist_env import CassieEnv_footdist
from .cassie_noaccel_footdist_env import CassieEnv_noaccel_footdist
@@ -10,7 +12,7 @@
from .cassie_novel_footdist_env import CassieEnv_novel_footdist
from .cassie_mininput_env import CassieEnv_mininput
-
+# CassieMujocoSim
from .cassiemujoco import *
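
With the renames above, the two unified environments can be constructed directly. A minimal sketch follows; the keyword names come from the constructors later in this diff, and the reward name is just the constructor default:

```python
from cassie import CassieEnv, CassieTrajEnv

# Unified env: clock command input, full state-estimator input, no reference trajectory.
env = CassieEnv(command_profile="clock", input_profile="full",
                dynamics_randomization=True, reward="iros_paper")

# Trajectory env: keeps the legacy traj / no_delta / ik_baseline options.
traj_env = CassieTrajEnv(traj="walking", command_profile="traj", input_profile="full",
                         no_delta=True, ik_baseline=False, reward="iros_paper")

print(len(env.observation_space), len(env.action_space))   # 50 and 10 for clock + full
```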
diff --git a/cassie/cassie.py b/cassie/cassie.py
index 89612bda..978e27b9 100644
--- a/cassie/cassie.py
+++ b/cassie/cassie.py
@@ -2,14 +2,13 @@
from .cassiemujoco import pd_in_t, state_out_t, CassieSim, CassieVis
-from .trajectory import *
from cassie.quaternion_function import *
from cassie.phase_function import *
from .rewards import *
from math import floor
-import numpy as np
+import numpy as np
import os
import random
import copy
@@ -25,57 +24,29 @@ def load_reward_clock_funcs(path):
return clock_funcs
-class CassieEnv_v2:
- def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=True, state_est=True, dynamics_randomization=True,
- no_delta=True, learn_gains=False, ik_baseline=False, reward="iros_paper",
+class CassieEnv:
+ def __init__(self, traj='walking', simrate=50, command_profile="clock", input_profile="full", dynamics_randomization=True,
+ learn_gains=False, reward="iros_paper",
config="./cassie/cassiemujoco/cassie.xml", history=0, **kwargs):
dirname = os.path.dirname(__file__)
- #xml_path = os.path.join(dirname, "cassiemujoco", "cassie.xml")
- #self.sim = CassieSim(xml_path)
self.config = config
self.sim = CassieSim(self.config)
# self.sim = CassieSim("./cassie/cassiemujoco/cassie_drop_step.xml")
self.vis = None
# Arguments for the simulation and state space
- self.phase_based = phase_based
- self.clock_based = clock_based
- self.state_est = state_est
- self.no_delta = no_delta
+ self.command_profile = command_profile
+ self.input_profile = input_profile
+ self.clock_based = True
self.dynamics_randomization = dynamics_randomization
- self.ik_baseline = ik_baseline
- assert(not (phase_based and clock_based))
-
- # Arguments generally used for curriculum training
+ # Arguments for reward function
self.reward_func = reward
self.early_term_cutoff = 0.3
- curriculum_defaults = {
- "fixed_speed" : None
- }
- for (arg, default) in curriculum_defaults.items():
- setattr(self, arg, kwargs.get(arg, default))
-
- # CONFIGURE REF TRAJECTORY to use
- if traj == "aslip":
- self.speeds = np.array([x / 10 for x in range(0, 21)])
- self.trajectories = getAllTrajectories(self.speeds)
- self.num_speeds = len(self.trajectories)
- self.traj_idx = 0
- self.speed = self.speeds[self.traj_idx]
- self.trajectory = self.trajectories[self.traj_idx]
- self.aslip_traj = True
- else:
- self.aslip_traj = False
- if traj == "walking":
- traj_path = os.path.join(dirname, "trajectory", "stepdata.bin")
- elif traj == "stepping":
- traj_path = os.path.join(dirname, "trajectory", "more-poses-trial.bin")
- self.trajectory = CassieTrajectory(traj_path)
- self.speed = 0
- self.observation_space, self.clock_inds, self.mirrored_obs = self.set_up_state_space()
+ # State space
+ self.observation_space, self.clock_inds, self.mirrored_obs = self.set_up_state_space(self.command_profile, self.input_profile)
# Adds option for state history for FF nets
self._obs = len(self.observation_space)
@@ -88,46 +59,41 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
# learn gains means there is a delta on the default PD gains ***FOR EACH LEG***
self.learn_gains = learn_gains
- if self.learn_gains == True:
+ if self.learn_gains:
self.action_space = np.zeros(10 + 20)
+ self.mirrored_acts = [-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4,
+ -15, -16, 17, 18, 19, -10, -11, 12, 13, 14,
+ -25, -26, 27, 28, 29, -20, -21, 22, 23, 24]
else:
self.action_space = np.zeros(10)
+ self.mirrored_acts = [-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4]
self.u = pd_in_t()
# TODO: should probably initialize this to current state
self.cassie_state = state_out_t()
-
- self.simrate = simrate # simulate X mujoco steps with same pd target
- # 50 brings simulation from 2000Hz to exactly 40Hz
-
- self.simsteps = 0 # number of simulation steps
-
- self.time = 0 # number of time steps in current episode
- self.phase = 0 # portion of the phase the robot is in
- self.counter = 0 # number of phase cycles completed in episode
+ self.simrate = simrate # simulate X mujoco steps with same pd target. 50 brings simulation from 2000Hz to exactly 40Hz
+ self.time = 0 # number of time steps in current episode
+ self.phase = 0 # portion of the phase the robot is in
+ self.counter = 0 # number of phase cycles completed in episode
# NOTE: a reference trajectory represents ONE phase cycle
- # phase function variables
- self.swing_duration = 0.15
- self.stance_duration = 0.25
- self.stance_mode = "zero"
- # constants
- self.strict_relaxer = 0.1
- self.have_incentive = False if "no_incentive" in self.reward_func else True
+ # should be floor(len(traj) / simrate) - 1
+        # should be VERY cautious here, because wrapping around the trajectory
+        # badly can cause asymmetrical/bad gaits
+ self.phaselen = 32
+ self.phase_add = 1
- if not self.phase_based:
- # should be floor(len(traj) / simrate) - 1
- # should be VERY cautious here because wrapping around trajectory
- # badly can cause assymetrical/bad gaits
- self.phaselen = floor(len(self.trajectory) / self.simrate) - 1 if not self.aslip_traj else self.trajectory.length - 1
+ # NOTE: phase_based modifies self.phaselen throughout training
# Set up phase based / load in clock based reward func
+ self.have_incentive = False if "no_incentive" in self.reward_func else True
+ self.strict_relaxer = 0.1
self.early_reward = False
- if self.phase_based:
+ if self.command_profile == "phase":
self.set_up_phase_reward()
- elif self.clock_based:
+ elif self.command_profile == "clock":
self.set_up_clock_reward(dirname)
# see include/cassiemujoco.h for meaning of these indices
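
The signed indices in `mirrored_acts` (and in `mirrored_obs` below) encode which source element to copy and whether to negate it: floor(|i|) is the source index, the sign flips the value, and 0.1 stands in for index 0 so that it can still carry a sign. A standalone sketch of how such a list is applied (`apply_mirror` is an illustration, not code from this repository):

```python
import numpy as np

def apply_mirror(vec, signed_inds):
    # floor(|i|) selects the source element, sign(i) optionally negates it
    return np.array([np.sign(i) * vec[int(np.floor(np.abs(i)))] for i in signed_inds])

mirrored_acts = [-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4]   # 10-DoF case from above
action = np.arange(1.0, 11.0)                          # stand-in action vector
print(apply_mirror(action, mirrored_acts))
# [-6. -7.  8.  9. 10. -1. -2.  3.  4.  5.]
```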
@@ -138,18 +104,23 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
self.vel_index = np.array([0,1,2,3,4,5,6,7,8,12,13,14,18,19,20,21,25,26,27,31])
# CONFIGURE OFFSET for No Delta Policies
- if self.aslip_traj:
- ref_pos, ref_vel = self.get_ref_state(self.phase)
- self.offset = ref_pos[self.pos_idx]
- else:
- self.offset = np.array([0.0045, 0.0, 0.4973, -1.1997, -1.5968, 0.0045, 0.0, 0.4973, -1.1997, -1.5968])
+ self.offset = np.array([0.0045, 0.0, 0.4973, -1.1997, -1.5968, 0.0045, 0.0, 0.4973, -1.1997, -1.5968])
- self.phase_add = 1
+ self.max_orient_change = 0.2
+
+ self.max_simrate = self.simrate + 10
+ self.min_simrate = self.simrate - 20
+
+ self.max_speed = 4.0
+ self.min_speed = -0.3
+
+ self.max_side_speed = 0.3
+ self.min_side_speed = -0.3
# global flat foot orientation, can be useful part of reward function:
self.neutral_foot_orient = np.array([-0.24790886454547323, -0.24679713195445646, -0.6609396704367185, 0.663921021343526])
- # tracking various variables for reward funcs
+ # various tracking variables for reward funcs
self.stepcount = 0
self.l_high = False # only true if foot is above 0.2m
self.r_high = False
@@ -174,16 +145,25 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
self.dynamics_randomization = dynamics_randomization
self.slope_rand = dynamics_randomization
self.joint_rand = dynamics_randomization
+
self.max_pitch_incline = 0.03
self.max_roll_incline = 0.03
+
self.encoder_noise = 0.01
+
self.damping_low = 0.3
self.damping_high = 5.0
+
self.mass_low = 0.5
self.mass_high = 1.5
+
self.fric_low = 0.4
self.fric_high = 1.1
+ self.speed = 0
+ self.side_speed = 0
+ self.orient_add = 0
+
# Record default dynamics parameters
self.default_damping = self.sim.get_dof_damping()
self.default_mass = self.sim.get_body_mass()
@@ -191,34 +171,20 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
self.default_fric = self.sim.get_geom_friction()
self.default_rgba = self.sim.get_geom_rgba()
self.default_quat = self.sim.get_geom_quat()
+
self.motor_encoder_noise = np.zeros(10)
self.joint_encoder_noise = np.zeros(6)
- ### Trims ###
- self.joint_offsets = np.zeros(16)
- self.com_vel_offset = 0
- self.y_offset = 0
-
- ### Random commands during training ###
- self.speed_schedule = np.zeros(4)
- self.orient_add = 0
- self.orient_time = 500
-
# Keep track of actions, torques
self.prev_action = None
self.curr_action = None
self.prev_torque = None
- # for RNN policies
- self.critic_state = None
-
self.debug = False
# Set up phase reward for dynamic phase functions
def set_up_phase_reward(self):
- self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
-
if "early" in self.reward_func:
self.early_reward = True
@@ -243,11 +209,6 @@ def set_up_clock_reward(self, dirname):
self.left_clock = self.reward_clock_func["left"]
self.right_clock = self.reward_clock_func["right"]
self.reward_func = "load_clock"
- elif "aslip" in self.reward_func:
- self.reward_clock_func = load_reward_clock_funcs(os.path.join(dirname, "rewards", "reward_clock_funcs", "no_incentive_aslip_clock_strict0.3.pkl"))
- self.left_clock = self.reward_clock_func["left"]
- self.right_clock = self.reward_clock_func["right"]
- self.reward_func = "aslip_clock"
else:
if "grounded" in self.reward_func:
@@ -270,76 +231,67 @@ def set_up_clock_reward(self, dirname):
# approach = ""
self.reward_func = "clock"
- self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
-
- def set_up_state_space(self):
-
- mjstate_size = 40
- state_est_size = 46
-
- speed_size = 1
+ def set_up_state_space(self, command_profile, input_profile):
- clock_size = 2
- # NOTE: phase_based includes clock also
+ full_state_est_size = 46
+ min_state_est_size = 21
+ speed_size = 2 # x speed, y speed
+ clock_size = 2 # sin, cos
phase_size = 5 # swing duration, stance duration, one-hot encoding of stance mode
- # Determine robot state portion of obs, mirr_obs
- if self.state_est:
+ # input --> FULL
+ if input_profile == "full":
base_mir_obs = np.array([0.1, 1, -2, 3, -4, -10, -11, 12, 13, 14, -5, -6, 7, 8, 9, 15, -16, 17, -18, 19, -20, -26, -27, 28, 29, 30, -21, -22, 23, 24, 25, 31, -32, 33, 37, 38, 39, 34, 35, 36, 43, 44, 45, 40, 41, 42])
- obs_size = state_est_size
+ obs_size = full_state_est_size
+ # input --> MIN
+ elif input_profile == "min":
+ base_mir_obs = np.array([
+ 3, 4, 5, # L foot relative pos
+ 0.1, 1, 2, # R foot relative pos
+ 6, -7, 8, -9, # pelvis orient (quaternion)
+ -10, 11, -12, # pelvis rot Vel
+ 17, -18, 19, -20, # L foot orient
+ 13, -14, 15, -16 # R foot orient
+ ])
+ obs_size = min_state_est_size
else:
- base_mir_obs = np.array([0.1, 1, 2, -3, 4, -5, -13, -14, 15, 16, 17, 18, 19, -6, -7, 8, 9, 10, 11, 12, 20, -21, 22, -23, 24, -25, -33, -34, 35, 36, 37, 38, 39, -26, -27, 28, 29, 30, 31, 32])
- obs_size = mjstate_size
-
- # CLOCK_BASED : clock, speed
- if self.clock_based:
+ raise NotImplementedError
+
+ # command --> CLOCK_BASED : clock, speed
+ if command_profile == "clock":
append_obs = np.array([len(base_mir_obs) + i for i in range(clock_size+speed_size)])
mirrored_obs = np.concatenate([base_mir_obs, append_obs])
clock_inds = append_obs[0:clock_size].tolist()
obs_size += clock_size + speed_size
- # PHASE_BASED : clock, phase info, speed
- elif self.phase_based:
+ # command --> PHASE_BASED : clock, phase info, speed
+ elif command_profile == "phase":
append_obs = np.array([len(base_mir_obs) + i for i in range(clock_size+phase_size+speed_size)])
mirrored_obs = np.concatenate([base_mir_obs, append_obs])
clock_inds = append_obs[0:clock_size].tolist()
obs_size += clock_size + phase_size + speed_size
- # REF TRAJ INPUT : use traj indices determined earlier
else:
- # Find the mirrored trajectory
- if self.aslip_traj:
- ref_traj_size = 18
- mirrored_traj = np.array([6,7,8,9,10,11,0.1,1,2,3,4,5,12,13,14,15,16,17])
- else:
- ref_traj_size = 40
- mirrored_traj = np.array([0.1, 1, 2, 3, 4, 5, -13, -14, 15, 16, 17, 18, 19, -6, -7, 8, 9, 10, 11, 12,
- 20, 21, 22, 23, 24, 25, -33, -34, 35, 36, 37, 38, 39, -26, -27, 28, 29, 30, 31, 32])
- mirrored_traj_sign = np.multiply(np.sign(mirrored_traj), obs_size+np.floor(np.abs(mirrored_traj)))
- mirrored_obs = np.concatenate([base_mir_obs, mirrored_traj_sign])
- clock_inds = None
- obs_size += ref_traj_size
+ raise NotImplementedError
observation_space = np.zeros(obs_size)
mirrored_obs = mirrored_obs.tolist()
- # check_arr = np.arange(obs_size, dtype=np.float64)
- # check_arr[0] = 0.1
- # print("mir obs check: ", np.all(np.sort(np.abs(mirrored_obs)) == check_arr))
- # exit()
-
return observation_space, clock_inds, mirrored_obs
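
The observation sizes implied by the branches above work out as follows (a quick arithmetic check, not repository code):

```python
FULL, MIN = 46, 21             # state-estimator input sizes
CLOCK, PHASE, SPEED = 2, 5, 2  # sin/cos clock, phase info, x/y speed command

obs_sizes = {
    ("clock", "full"): FULL + CLOCK + SPEED,           # 50
    ("clock", "min"):  MIN + CLOCK + SPEED,            # 25
    ("phase", "full"): FULL + CLOCK + PHASE + SPEED,   # 55
    ("phase", "min"):  MIN + CLOCK + PHASE + SPEED,    # 30
}
print(obs_sizes)
```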
- def step_simulation(self, action, learned_gains=None):
+ def rotate_to_orient(self, vec):
+ quaternion = euler2quat(z=self.orient_add, y=0, x=0)
+ iquaternion = inverse_quaternion(quaternion)
- if not self.ik_baseline:
- if self.aslip_traj and self.phase == self.phaselen - 1:
- ref_pos, ref_vel = self.get_ref_state(0)
- else:
- ref_pos, ref_vel = self.get_ref_state(self.phase + self.phase_add)
- else:
- ref_pos = self.trajectory.ik_pos[self.simsteps]
+ if len(vec) == 3:
+ return rotate_by_quaternion(vec, iquaternion)
+
+ elif len(vec) == 4:
+ new_orient = quaternion_product(iquaternion, vec)
+ if new_orient[0] < 0:
+ new_orient = -new_orient
+ return new_orient
+
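
A self-contained numpy sketch of the rotation `rotate_to_orient` performs, shown for the 3-vector case. The real code uses the helpers in `cassie.quaternion_function`; the w, x, y, z ordering below is an assumption:

```python
import numpy as np

def yaw_quat(z):
    # unit quaternion for a rotation of z radians about the vertical axis
    return np.array([np.cos(z / 2), 0.0, 0.0, np.sin(z / 2)])

def quat_inverse(q):
    w, x, y, z = q
    return np.array([w, -x, -y, -z]) / np.dot(q, q)

def quat_product(a, b):
    w1, x1, y1, z1 = a
    w2, x2, y2, z2 = b
    return np.array([
        w1*w2 - x1*x2 - y1*y2 - z1*z2,
        w1*x2 + x1*w2 + y1*z2 - z1*y2,
        w1*y2 - x1*z2 + y1*w2 + z1*x2,
        w1*z2 + x1*y2 - y1*x2 + z1*w2,
    ])

def rotate_vec(v, q):
    # rotate a 3-vector by quaternion q: q * (0, v) * q^-1, keep the vector part
    return quat_product(quat_product(q, np.concatenate(([0.0], v))), quat_inverse(q))[1:]

orient_add = 0.2                              # commanded heading offset (rad)
iq = quat_inverse(yaw_quat(orient_add))       # inverse rotation, into the command frame
print(rotate_vec(np.array([1.0, 0.0, 0.0]), iq))  # ≈ [0.980, -0.199, 0.0]
```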
+ def step_simulation(self, action, learned_gains=None):
- if not self.no_delta:
- self.offset = ref_pos[self.pos_idx]
target = action + self.offset
if self.joint_rand:
@@ -353,7 +305,7 @@ def step_simulation(self, action, learned_gains=None):
# TODO: move setting gains out of the loop?
# maybe write a wrapper for pd_in_t ?
- if self.learn_gains == False:
+ if not self.learn_gains:
self.u.leftLeg.motorPd.pGain[i] = self.P[i]
self.u.rightLeg.motorPd.pGain[i] = self.P[i]
self.u.leftLeg.motorPd.dGain[i] = self.D[i]
@@ -401,16 +353,7 @@ def step_simulation(self, action, learned_gains=None):
# Basic version of step_simulation, that only simulates forward in time, does not do any other
# computation for reward, etc. Is faster and should be used for evaluation purposes
def step_sim_basic(self, action, learned_gains=None):
- if not self.ik_baseline:
- if self.aslip_traj and self.phase == self.phaselen - 1:
- ref_pos, ref_vel = self.get_ref_state(0)
- else:
- ref_pos, ref_vel = self.get_ref_state(self.phase + self.phase_add)
- else:
- ref_pos = self.trajectory.ik_pos[self.simsteps]
- if not self.no_delta:
- self.offset = ref_pos[self.pos_idx]
target = action + self.offset
if self.joint_rand:
@@ -445,9 +388,8 @@ def step_sim_basic(self, action, learned_gains=None):
def step(self, action, return_omniscient_state=False, f_term=0):
- delay_rand = 7
if self.dynamics_randomization:
- simrate = self.simrate + np.random.randint(-delay_rand, delay_rand+1)
+            simrate = int(np.random.uniform(self.min_simrate, self.max_simrate))  # random number of sim steps per policy step
else:
simrate = self.simrate
@@ -470,7 +412,6 @@ def step(self, action, return_omniscient_state=False, f_term=0):
self.step_simulation(action, learned_gains)
else:
self.step_simulation(action)
- self.simsteps += 1
qpos = np.copy(self.sim.qpos())
qvel = np.copy(self.sim.qvel())
# Foot Force Tracking
@@ -506,9 +447,8 @@ def step(self, action, return_omniscient_state=False, f_term=0):
self.time += 1
self.phase += self.phase_add
- if (self.aslip_traj and self.phase >= self.phaselen) or self.phase > self.phaselen:
+ if self.phase > self.phaselen:
self.last_pelvis_pos = self.sim.qpos()[0:3]
- self.simsteps = 0
self.phase = 0
self.counter += 1
@@ -540,6 +480,16 @@ def step(self, action, return_omniscient_state=False, f_term=0):
if reward < self.early_term_cutoff:
done = True
+ if np.random.randint(300) == 0: # random changes to orientation
+ self.orient_add += np.random.uniform(-self.max_orient_change, self.max_orient_change)
+
+ if np.random.randint(100) == 0: # random changes to speed
+ self.speed = np.random.uniform(self.min_speed, self.max_speed)
+ self.speed = np.clip(self.speed, self.min_speed, self.max_speed)
+
+ if np.random.randint(300) == 0: # random changes to sidespeed
+ self.side_speed = np.random.uniform(self.min_side_speed, self.max_side_speed)
+
if return_omniscient_state:
return self.get_full_state(), self.get_omniscient_state(), reward, done, {}
else:
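
The resampling probabilities above translate into fairly infrequent command changes. A back-of-the-envelope check, assuming the default 40 Hz policy rate (simrate 50 at 2000 Hz):

```python
policy_hz = 40  # 2000 Hz simulation / simrate of 50
for name, p in [("orientation", 1 / 300), ("speed", 1 / 100), ("side speed", 1 / 300)]:
    steps = 1 / p  # mean of the geometric distribution
    print(f"{name}: every ~{steps:.0f} steps, ~{steps / policy_hz:.1f} s of sim time on average")
```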
@@ -556,14 +506,12 @@ def step_basic(self, action, return_omniscient_state=False):
self.step_sim_basic(action, learned_gains)
else:
self.step_sim_basic(action)
- self.simsteps += 1
self.time += 1
self.phase += self.phase_add
- if (self.aslip_traj and self.phase >= self.phaselen) or self.phase > self.phaselen:
+ if self.phase > self.phaselen:
self.last_pelvis_pos = self.sim.qpos()[0:3]
- self.simsteps = 0
self.phase = 0
self.counter += 1
@@ -574,26 +522,11 @@ def step_basic(self, action, return_omniscient_state=False):
def reset(self):
- if self.aslip_traj:
- self.traj_idx = random.randint(0, self.num_speeds-1)
- self.speed = self.speeds[self.traj_idx]
- # print("current speed: {}\tcurrent traj: {}".format(self.speed, random_speed_idx))
- self.trajectory = self.trajectories[self.traj_idx] # switch the current trajectory
- self.phaselen = self.trajectory.length - 1
- else:
- self.speed = (random.randint(0, 40)) / 10
- # # Make sure that if speed is above 2, freq is at least 1.2
- # if self.speed > 1.3:# or np.any(self.speed_schedule > 1.6):
- # self.phase_add = 1.3 + 0.7*random.random()
- # else:
- # self.phase_add = 1 + random.random()
-
- if self.fixed_speed != None:
- self.traj_idx = (np.abs(self.speeds - self.speed)).argmin()
- self.speed = self.fixed_speed
+ self.speed = np.random.uniform(self.min_speed, self.max_speed)
+ self.side_speed = np.random.uniform(self.min_side_speed, self.max_side_speed)
# Set up phase based
- if self.phase_based:
+ if self.command_profile == "phase":
if self.phase_input_mode == "library":
# constrain speed a bit further
self.speed = (random.randint(0, 30)) / 10
@@ -611,7 +544,7 @@ def reset(self):
self.stance_mode = np.random.choice(["grounded", "aerial", "zero"])
self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
# ELSE load in clock based reward func
- elif self.reward_func == "aslip_clock" or self.reward_func == "load_clock":
+ elif self.reward_func == "load_clock":
pass
# ELSE use simple relationship to define swing and stance duration
else:
@@ -620,13 +553,11 @@ def reset(self):
self.stance_mode = "grounded"
else:
self.stance_mode = "aerial"
- total_duration = (0.9 - 0.2 / 3.0 * self.speed) / 2
- self.swing_duration = (0.375 + ((0.625 - 0.375) / 3) * self.speed) * total_duration
- self.stance_duration = (0.625 - ((0.625 - 0.375) / 3) * self.speed) * total_duration
+ total_duration = (0.9 - 0.25 / 3.0 * abs(self.speed)) / 2
+ self.swing_duration = (0.30 + ((0.70 - 0.30) / 3) * abs(self.speed)) * total_duration
+ self.stance_duration = (0.70 - ((0.70 - 0.30) / 3) * abs(self.speed)) * total_duration
self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
- self.simsteps = 0
-
self.phase = random.randint(0, floor(self.phaselen))
self.time = 0
self.counter = 0
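
To make the schedule above concrete, here is the swing/stance split at a commanded speed of 1.5 m/s (an arbitrary example value):

```python
speed = 1.5
total_duration = (0.9 - 0.25 / 3.0 * abs(speed)) / 2                          # ≈ 0.3875 s per step
swing_duration = (0.30 + ((0.70 - 0.30) / 3) * abs(speed)) * total_duration   # ≈ 0.194 s
stance_duration = (0.70 - ((0.70 - 0.30) / 3) * abs(speed)) * total_duration  # ≈ 0.194 s
print(total_duration, swing_duration, stance_duration)
# the swing and stance fractions always sum to 1, so swing + stance == total_duration
```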
@@ -728,27 +659,15 @@ def reset(self):
# apply dynamics
self.sim.set_const()
- qpos, qvel = self.get_ref_state(self.phase)
- # orientation = random.randint(-10, 10) * np.pi / 25
- # quaternion = euler2quat(z=orientation, y=0, x=0)
- # qpos[3:7] = quaternion
- # self.y_offset = 0#random.uniform(-3.5, 3.5)
- # qpos[1] = self.y_offset
-
- if self.aslip_traj:
- qvel = np.zeros(qvel.shape)
-
- self.sim.set_qpos(qpos)
- self.sim.set_qvel(qvel)
-
self.last_pelvis_pos = self.sim.qpos()[0:3]
# Need to reset u? Or better way to reset cassie_state than taking step
self.cassie_state = self.sim.step_pd(self.u)
+ # reset commands
self.orient_add = 0 # random.randint(-10, 10) * np.pi / 25
- self.orient_time = 0 # random.randint(50, 200)
- self.com_vel_offset = 0 # 0.1*np.random.uniform(-0.1, 0.1, 2)
+ self.speed = np.random.uniform(self.min_speed, self.max_speed)
+ self.side_speed = np.random.uniform(self.min_side_speed, self.max_side_speed)
# reset mujoco tracking variables
self.l_foot_frc = 0
@@ -761,28 +680,18 @@ def reset(self):
return self.get_full_state()
def reset_for_test(self, full_reset=False):
- self.simsteps = 0
self.phase = 0
self.time = 0
self.counter = 0
self.orient_add = 0
- self.orient_time = np.inf
- self.y_offset = 0
self.phase_add = 1
self.state_history = [np.zeros(self._obs) for _ in range(self.history+1)]
- if self.aslip_traj:
- self.traj_idx = 0
- self.speed = 0
- # print("current speed: {}".format(self.speed))
- self.trajectory = self.trajectories[self.traj_idx] # switch the current trajectory
- self.phaselen = self.trajectory.length - 1
- else:
- self.speed = 0
+ self.speed = 0
# load in clock based reward func
- if self.reward_func == "aslip_clock" or self.reward_func == "load_clock":
+ if self.reward_func == "load_clock":
pass
# ELSE use simple relationship to define swing and stance duration
else:
@@ -793,9 +702,6 @@ def reset_for_test(self, full_reset=False):
self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
if not full_reset:
- qpos, qvel = self.get_ref_state(self.phase)
- self.sim.set_qpos(qpos)
- self.sim.set_qvel(qvel)
# reset mujoco tracking variables
self.last_pelvis_pos = self.sim.qpos()[0:3]
@@ -842,42 +748,27 @@ def reset_cassie_state(self):
# Helper function for updating the speed, used in visualization tests
# not needed in training cause we don't change speeds in middle of rollout, and
# we randomize the starting phase of each rollout
- def update_speed(self, new_speed):
- if self.aslip_traj:
- self.traj_idx = (np.abs(self.speeds - new_speed)).argmin()
- self.speed = self.traj_idx / 10
- self.trajectory = self.trajectories[self.traj_idx]
- old_phaselen = self.phaselen
- self.phaselen = self.trajectory.length - 1
- # update phase
- self.phase = int(self.phaselen * self.phase / old_phaselen)
- # new offset
- ref_pos, ref_vel = self.get_ref_state(self.phase)
- self.offset = ref_pos[self.pos_idx]
- else:
- self.speed = new_speed
+ def update_speed(self, new_speed, new_side_speed=0.0):
+
+ self.speed = np.clip(new_speed, self.min_speed, self.max_speed)
+ self.side_speed = np.clip(new_side_speed, self.min_side_speed, self.max_side_speed)
- if self.phase_based:
+ if self.command_profile == "phase":
self.swing_duration = max(0.01, self.swing_duration)
self.stance_duration = max(0.01, self.stance_duration)
old_phaselen = self.phaselen
self.set_up_phase_reward()
self.phase = int(self.phaselen * self.phase / old_phaselen)
- elif self.reward_func == "aslip_clock":
- pass
else:
- total_duration = (0.9 - 0.2 / 3.0 * self.speed) / 2
- self.swing_duration = (0.375 + ((0.625 - 0.375) / 3) * self.speed) * total_duration
- self.stance_duration = (0.625 - ((0.625 - 0.375) / 3) * self.speed) * total_duration
+ total_duration = (0.9 - 0.25 / 3.0 * self.speed) / 2
+ self.swing_duration = (0.30 + ((0.70 - 0.30) / 3) * self.speed) * total_duration
+ self.stance_duration = (0.70 - ((0.70 - 0.30) / 3) * self.speed) * total_duration
old_phaselen = self.phaselen
self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
self.phase = int(self.phaselen * self.phase / old_phaselen)
def compute_reward(self, action):
- qpos = np.copy(self.sim.qpos())
- qvel = np.copy(self.sim.qvel())
- ref_pos, ref_vel = self.get_ref_state(self.phase)
if self.reward_func == "clock" or self.reward_func == "load_clock" or self.reward_func == "switch_clock":
self.early_term_cutoff = -99.
if self.early_reward:
@@ -890,78 +781,13 @@ def compute_reward(self, action):
elif self.reward_func == "max_vel_clock":
self.early_term_cutoff = -99.
return max_vel_clock_reward(self, action)
- elif self.reward_func == "aslip_clock":
- self.early_term_cutoff = -99.
- return aslip_clock_reward(self, action)
- elif self.reward_func == "aslip_old":
- self.early_term_cutoff = 0.0
- return aslip_old_reward(self, action)
- elif self.reward_func == "iros_paper":
- return iros_paper_reward(self)
- elif self.reward_func == "5k_speed_reward":
- return old_speed_reward(self)
- elif self.reward_func == "trajmatch_footorient_hiprollvelact_reward":
- return trajmatch_footorient_hiprollvelact_reward(self)
- elif self.reward_func == "speedmatch_footorient_hiprollvelact_reward":
- return speedmatch_footorient_hiprollvelact_reward(self)
else:
raise NotImplementedError
- # get the corresponding state from the reference trajectory for the current phase
- def get_ref_state(self, phase=None):
- if phase is None:
- phase = self.phase
-
- if phase > self.phaselen:
- phase = 0
-
- # TODO: make this not so hackish
- if phase > floor(len(self.trajectory) / self.simrate) - 1:
- phase = floor((phase / self.phaselen) * len(self.trajectory) / self.simrate)
-
- desired_ind = phase * self.simrate if not self.aslip_traj else phase
- # phase_diff = desired_ind - math.floor(desired_ind)
- # if phase_diff != 0: # desired ind is an int
- # pos_prev = np.copy(self.trajectory.qpos[math.floor(desired_ind)])
- # vel_prev = np.copy(self.trajectory.qvel[math.floor(desired_ind)])
- # pos_next = np.copy(self.trajectory.qpos[math.ceil(desired_ind)])
- # vel_next = np.copy(self.trajectory.qvel[math.ceil(desired_ind)])
- # pos = pos_prev + phase_diff * (pos_next - pos_prev)
- # vel = vel_prev + phase_diff * (vel_next - vel_prev)
- # else:
- # print("desired ind: ", desired_ind)
- pos = np.copy(self.trajectory.qpos[int(desired_ind)])
- vel = np.copy(self.trajectory.qvel[int(desired_ind)])
-
- # this is just setting the x to where it "should" be given the number
- # of cycles
- # pos[0] += (self.trajectory.qpos[-1, 0] - self.trajectory.qpos[0, 0]) * self.counter
-
- # ^ should only matter for COM error calculation,
- # gets dropped out of state variable for input reasons
-
- ### Setting variable speed
- if not self.aslip_traj:
- pos[0] *= self.speed
- pos[0] += (self.trajectory.qpos[-1, 0] - self.trajectory.qpos[0, 0]) * self.counter * self.speed
- else:
- pos[0] += (self.trajectory.qpos[-1, 0] - self.trajectory.qpos[0, 0]) * self.counter
-
- # setting lateral distance target to 0?
- # regardless of reference trajectory?
- pos[1] = 0
-
- if not self.aslip_traj:
- vel[0] *= self.speed
-
- return pos, vel
-
def get_full_state(self):
qpos = np.copy(self.sim.qpos())
qvel = np.copy(self.sim.qvel())
- ref_pos, ref_vel = self.get_ref_state(self.phase + self.phase_add)
-
# TODO: maybe convert to set subtraction for clarity
# {i for i in range(35)} -
# {0, 10, 11, 12, 13, 17, 18, 19, 24, 25, 26, 27, 31, 32, 33}
@@ -975,69 +801,57 @@ def get_full_state(self):
# trajectory despite being global coord. Y is only invariant to straight
# line trajectories.
- # CLOCK BASED (NO TRAJECTORY)
- if self.phase_based:
+ # command --> PHASE_BASED : clock, phase info, speed
+ if self.command_profile == "phase":
clock = [np.sin(2 * np.pi * self.phase / self.phaselen),
np.cos(2 * np.pi * self.phase / self.phaselen)]
-
- ext_state = np.concatenate((clock, [self.swing_duration, self.stance_duration], encode_stance_mode(self.stance_mode), [self.speed]))
-
- elif self.clock_based:
+ ext_state = np.concatenate((clock, [self.swing_duration, self.stance_duration], encode_stance_mode(self.stance_mode), [self.speed, self.side_speed]))
+ # command --> CLOCK_BASED : clock, speed
+ elif self.command_profile == "clock":
clock = [np.sin(2 * np.pi * self.phase / self.phaselen),
np.cos(2 * np.pi * self.phase / self.phaselen)]
- ext_state = np.concatenate((clock, [self.speed]))
-
- # ASLIP TRAJECTORY
- elif self.aslip_traj and not self.clock_based:
- if(self.phase == 0):
- # ext_state = np.concatenate(get_ref_aslip_ext_state(self, self.cassie_state, self.last_pelvis_pos, self.phaselen - 1, offset=self.vertOffset))
- ext_state = np.concatenate(get_ref_aslip_unaltered_state(self, self.phaselen - 1))
- else:
- # ext_state = np.concatenate(get_ref_aslip_ext_state(self, self.cassie_state, self.last_pelvis_pos, self.phaselen - 1, offset=self.vertOffset))
- ext_state = np.concatenate(get_ref_aslip_unaltered_state(self, self.phase))
-
- # OTHER TRAJECTORY
+ ext_state = np.concatenate((clock, [self.speed, self.side_speed]))
else:
- ext_state = np.concatenate([ref_pos[self.pos_index], ref_vel[self.vel_index]])
+ raise NotImplementedError
# Update orientation
- new_orient = self.cassie_state.pelvis.orientation[:]
- new_translationalVelocity = self.cassie_state.pelvis.translationalVelocity[:]
- new_translationalAcceleleration = self.cassie_state.pelvis.translationalAcceleration[:]
- # new_translationalVelocity[0:2] += self.com_vel_offset
- quaternion = euler2quat(z=self.orient_add, y=0, x=0)
- iquaternion = inverse_quaternion(quaternion)
- new_orient = quaternion_product(iquaternion, self.cassie_state.pelvis.orientation[:])
- if new_orient[0] < 0:
- new_orient = -new_orient
- new_translationalVelocity = rotate_by_quaternion(self.cassie_state.pelvis.translationalVelocity[:], iquaternion)
- new_translationalAcceleleration = rotate_by_quaternion(self.cassie_state.pelvis.translationalAcceleration[:], iquaternion)
- motor_pos = self.cassie_state.motor.position[:]
- joint_pos = self.cassie_state.joint.position[:]
+ new_orient = self.rotate_to_orient(self.cassie_state.pelvis.orientation[:])
+ new_translationalVelocity = self.rotate_to_orient(self.cassie_state.pelvis.translationalVelocity[:])
+ new_translationalAcceleleration = self.rotate_to_orient(self.cassie_state.pelvis.translationalAcceleration[:])
+
+ # motor and joint poses
if self.joint_rand:
- motor_pos += self.joint_offsets[0:10]
- joint_pos += self.joint_offsets[10:16]
-
- # Use state estimator
- robot_state = np.concatenate([
- [self.cassie_state.pelvis.position[2] - self.cassie_state.terrain.height], # pelvis height
- new_orient, # pelvis orientation
- motor_pos, # actuated joint positions
-
- new_translationalVelocity, # pelvis translational velocity
- self.cassie_state.pelvis.rotationalVelocity[:], # pelvis rotational velocity
- self.cassie_state.motor.velocity[:], # actuated joint velocities
-
- new_translationalAcceleleration, # pelvis translational acceleration
-
- joint_pos, # unactuated joint positions
- self.cassie_state.joint.velocity[:] # unactuated joint velocities
- ])
-
- if self.state_est:
- state = np.concatenate([robot_state, ext_state])
+ motor_pos = self.cassie_state.motor.position[:] + self.motor_encoder_noise
+ joint_pos = self.cassie_state.joint.position[:] + self.joint_encoder_noise
+ else:
+ motor_pos = self.cassie_state.motor.position[:]
+ joint_pos = self.cassie_state.joint.position[:]
+
+ if self.input_profile == "min":
+ robot_state = np.concatenate([
+ self.cassie_state.leftFoot.position[:], # left foot position
+ self.cassie_state.rightFoot.position[:], # right foot position
+ new_orient, # pelvis orientation
+ self.cassie_state.pelvis.rotationalVelocity[:], # pelvis rotational velocity
+ self.cassie_state.leftFoot.orientation[:], # left foot orientation
+ self.cassie_state.rightFoot.orientation[:] # right foot orientation
+ ])
+ elif self.input_profile == "full":
+ robot_state = np.concatenate([
+ [self.cassie_state.pelvis.position[2] - self.cassie_state.terrain.height], # pelvis height
+ new_orient, # pelvis orientation
+ motor_pos, # actuated joint positions
+ new_translationalVelocity, # pelvis translational velocity
+ self.cassie_state.pelvis.rotationalVelocity[:], # pelvis rotational velocity
+ self.cassie_state.motor.velocity[:], # actuated joint velocities
+ new_translationalAcceleleration, # pelvis translational acceleration
+ joint_pos, # unactuated joint positions
+ self.cassie_state.joint.velocity[:] # unactuated joint velocities
+ ])
else:
- state = np.concatenate([qpos[self.pos_index], qvel[self.vel_index], ext_state])
+ raise NotImplementedError
+
+ state = np.concatenate([robot_state, ext_state])
self.state_history.insert(0, state)
self.state_history = self.state_history[:self.history+1]
@@ -1050,44 +864,6 @@ def render(self):
return self.vis.draw(self.sim)
- def get_state_info(self):
- # state estimator info
- pelvis_pos = self.cassie_state.pelvis.position[:]
- pelvis_pos[2] = pelvis_pos[2] - self.cassie_state.terrain.height
- lf_pos = self.cassie_state.leftFoot.position[:]
- rf_pos = self.cassie_state.rightFoot.position[:]
- lf_pos_global = [pelvis_pos[i] + lf_pos[i] for i in range(3)]
- rf_pos_global = [pelvis_pos[i] + rf_pos[i] for i in range(3)]
- # robot_state_info = np.array([pelvis_pos, lf_pos, rf_pos])
- robot_state_info = np.array([pelvis_pos, lf_pos_global, rf_pos_global])
-
- # mujoco info
- qpos = self.sim.qpos()
- actual_compos = qpos[0:3]
- actual_lf = self.l_foot_pos
- actual_rf = self.r_foot_pos
- # actual_lf = self.l_foot_pos - qpos[0:3]
- # actual_rf = self.r_foot_pos - qpos[0:3]
- actual_state_info = np.array([actual_compos, actual_lf, actual_rf])
-
- return robot_state_info, actual_state_info
-
- # This is assumed to be called after env.step. hence, using self.phase - 1
- def get_traj_and_state_info(self):
- # traj info used in rewards
- traj_info = get_ref_aslip_global_state(self, self.phase-1)
- # traj_info = get_ref_aslip_unaltered_state(self, self.phase)
- traj_info = [traj_info[4], traj_info[2], traj_info[0]]
-
- # traj info going into the policy
- # traj_cmd_info = get_ref_aslip_ext_state(self, self.cassie_state, self.last_pelvis_pos, self.phase, offset=self.vertOffset)
- traj_cmd_info = get_ref_aslip_unaltered_state(self, self.phase-1)
- traj_cmd_info = [traj_cmd_info[4], traj_cmd_info[2], traj_cmd_info[0]]
-
- robot_state_info, actual_state_info = self.get_state_info()
-
- return traj_info, traj_cmd_info, robot_state_info, actual_state_info
-
# Currently unused
# def get_omniscient_state(self):
diff --git a/cassie/cassie_min.py b/cassie/cassie_traj.py
similarity index 84%
rename from cassie/cassie_min.py
rename to cassie/cassie_traj.py
index 4c1dde27..428f6254 100644
--- a/cassie/cassie_min.py
+++ b/cassie/cassie_traj.py
@@ -9,7 +9,7 @@
from math import floor
-import numpy as np
+import numpy as np
import os
import random
import copy
@@ -25,39 +25,40 @@ def load_reward_clock_funcs(path):
return clock_funcs
-class CassieEnv_v3:
- def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=True, state_est=True, dynamics_randomization=True,
- no_delta=True, learn_gains=False, ik_baseline=False, reward="iros_paper",
+class CassieTrajEnv:
+ def __init__(self, traj='walking', simrate=50, command_profile="clock", input_profile="full", dynamics_randomization=True,
+ learn_gains=False, reward="iros_paper",
+ no_delta=True, ik_baseline=False,
config="./cassie/cassiemujoco/cassie.xml", history=0, **kwargs):
dirname = os.path.dirname(__file__)
- #xml_path = os.path.join(dirname, "cassiemujoco", "cassie.xml")
- #self.sim = CassieSim(xml_path)
self.config = config
self.sim = CassieSim(self.config)
# self.sim = CassieSim("./cassie/cassiemujoco/cassie_drop_step.xml")
self.vis = None
+ # legacy env attributes clock_based and phase_based
+ if command_profile == "clock":
+ self.clock_based, self.phase_based = True, False
+ elif command_profile == "phase":
+ self.clock_based, self.phase_based = False, True
+ elif command_profile == "traj":
+ self.clock_based, self.phase_based = False, False
+ if input_profile == "full":
+ self.min_input = False
+ elif input_profile == "min":
+ self.min_input = True
+
# Arguments for the simulation and state space
- self.phase_based = phase_based
- self.clock_based = clock_based
- self.state_est = state_est
- self.no_delta = no_delta
self.dynamics_randomization = dynamics_randomization
+ self.no_delta = no_delta
self.ik_baseline = ik_baseline
- assert(not (phase_based and clock_based))
-
- # Arguments generally used for curriculum training
+ # Arguments for reward function
self.reward_func = reward
self.early_term_cutoff = 0.3
- curriculum_defaults = {
- "fixed_speed" : None
- }
- for (arg, default) in curriculum_defaults.items():
- setattr(self, arg, kwargs.get(arg, default))
- # CONFIGURE REF TRAJECTORY to use
+ # CONFIGURE REF TRAJECTORY
if traj == "aslip":
self.speeds = np.array([x / 10 for x in range(0, 21)])
self.trajectories = getAllTrajectories(self.speeds)
@@ -88,42 +89,38 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
# learn gains means there is a delta on the default PD gains ***FOR EACH LEG***
self.learn_gains = learn_gains
- if self.learn_gains == True:
+ if self.learn_gains:
self.action_space = np.zeros(10 + 20)
+ self.mirrored_acts = [-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4,
+ -15, -16, 17, 18, 19, -10, -11, 12, 13, 14,
+ -25, -26, 27, 28, 29, -20, -21, 22, 23, 24]
else:
self.action_space = np.zeros(10)
+ self.mirrored_acts = [-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4]
self.u = pd_in_t()
# TODO: should probably initialize this to current state
self.cassie_state = state_out_t()
-
- self.simrate = simrate # simulate X mujoco steps with same pd target
- # 50 brings simulation from 2000Hz to exactly 40Hz
-
- self.simsteps = 0 # number of simulation steps
-
- self.time = 0 # number of time steps in current episode
- self.phase = 0 # portion of the phase the robot is in
- self.counter = 0 # number of phase cycles completed in episode
+ self.simrate = simrate # simulate X mujoco steps with same pd target. 50 brings simulation from 2000Hz to exactly 40Hz
+ self.simsteps = 0 # tracks number of mujoco simulation steps
+ self.time = 0 # number of time steps in current episode
+ self.phase = 0 # portion of the phase the robot is in
+ self.counter = 0 # number of phase cycles completed in episode
# NOTE: a reference trajectory represents ONE phase cycle
- # phase function variables
- self.swing_duration = 0.15
- self.stance_duration = 0.25
- self.stance_mode = "zero"
- # constants
- self.strict_relaxer = 0.1
- self.have_incentive = False if "no_incentive" in self.reward_func else True
+ # should be floor(len(traj) / simrate) - 1
+        # should be VERY cautious here, because wrapping around the trajectory
+        # badly can cause asymmetrical/bad gaits
+ self.phaselen = floor(len(self.trajectory) / self.simrate) - 1 if not self.aslip_traj else self.trajectory.length - 1
+ self.phase_add = 1
- if not self.phase_based:
- # should be floor(len(traj) / simrate) - 1
- # should be VERY cautious here because wrapping around trajectory
- # badly can cause assymetrical/bad gaits
- self.phaselen = floor(len(self.trajectory) / self.simrate) - 1 if not self.aslip_traj else self.trajectory.length - 1
+ # NOTE: phase_based modifies self.phaselen throughout training
# Set up phase based / load in clock based reward func
+ self.strict_relaxer = 0.1
+ self.have_incentive = False if "no_incentive" in self.reward_func else True
self.early_reward = False
if self.phase_based:
self.set_up_phase_reward()
@@ -144,12 +141,21 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
else:
self.offset = np.array([0.0045, 0.0, 0.4973, -1.1997, -1.5968, 0.0045, 0.0, 0.4973, -1.1997, -1.5968])
- self.phase_add = 1
+ self.max_orient_change = 0.2
+
+ self.max_simrate = self.simrate + 10
+ self.min_simrate = self.simrate - 20
+
+ self.max_speed = 4.0
+ self.min_speed = -0.3
+
+ self.max_side_speed = 0.3
+ self.min_side_speed = -0.3
# global flat foot orientation, can be useful part of reward function:
self.neutral_foot_orient = np.array([-0.24790886454547323, -0.24679713195445646, -0.6609396704367185, 0.663921021343526])
- # tracking various variables for reward funcs
+ # various tracking variables for reward funcs
self.stepcount = 0
self.l_high = False # only true if foot is above 0.2m
self.r_high = False
@@ -174,16 +180,25 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
self.dynamics_randomization = dynamics_randomization
self.slope_rand = dynamics_randomization
self.joint_rand = dynamics_randomization
+
self.max_pitch_incline = 0.03
self.max_roll_incline = 0.03
+
self.encoder_noise = 0.01
+
self.damping_low = 0.3
self.damping_high = 5.0
+
self.mass_low = 0.5
self.mass_high = 1.5
+
self.fric_low = 0.4
self.fric_high = 1.1
+ self.speed = 0
+ self.side_speed = 0
+ self.orient_add = 0
+
# Record default dynamics parameters
self.default_damping = self.sim.get_dof_damping()
self.default_mass = self.sim.get_body_mass()
@@ -191,34 +206,20 @@ def __init__(self, traj='walking', simrate=50, phase_based=False, clock_based=Tr
self.default_fric = self.sim.get_geom_friction()
self.default_rgba = self.sim.get_geom_rgba()
self.default_quat = self.sim.get_geom_quat()
+
self.motor_encoder_noise = np.zeros(10)
self.joint_encoder_noise = np.zeros(6)
- ### Trims ###
- self.joint_offsets = np.zeros(16)
- self.com_vel_offset = 0
- self.y_offset = 0
-
- ### Random commands during training ###
- self.speed_schedule = np.zeros(4)
- self.orient_add = 0
- self.orient_time = 500
-
# Keep track of actions, torques
self.prev_action = None
self.curr_action = None
self.prev_torque = None
- # for RNN policies
- self.critic_state = None
-
self.debug = False
# Set up phase reward for dynamic phase functions
def set_up_phase_reward(self):
- self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
-
if "early" in self.reward_func:
self.early_reward = True
@@ -270,44 +271,46 @@ def set_up_clock_reward(self, dirname):
# approach = ""
self.reward_func = "clock"
- self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
-
def set_up_state_space(self):
- mjstate_size = 40
- state_est_size = 21
-
- speed_size = 1
-
- clock_size = 2
- # NOTE: phase_based includes clock also
+ full_state_est_size = 46
+ min_state_est_size = 21
+ speed_size = 2 # x speed, y speed
+ clock_size = 2 # sin, cos
phase_size = 5 # swing duration, stance duration, one-hot encoding of stance mode
- # Determine robot state portion of obs, mirr_obs
- if self.state_est:
+ # input --> FULL
+ if not self.min_input:
+ base_mir_obs = np.array([0.1, 1, -2, 3, -4, -10, -11, 12, 13, 14, -5, -6, 7, 8, 9, 15, -16, 17, -18, 19, -20, -26, -27, 28, 29, 30, -21, -22, 23, 24, 25, 31, -32, 33, 37, 38, 39, 34, 35, 36, 43, 44, 45, 40, 41, 42])
+ obs_size = full_state_est_size
+
+ # input --> MIN
+ else:
base_mir_obs = np.array([
- 3, 4, 5, 0.1, 1, 2, 6, -7, 8, -9, # L and R foot, orient
- -10, 11, -12, # Rot Vel
- 17, -18, 19, -20, 13, -14, 15, -16
+ 3, 4, 5, # L foot relative pos
+ 0.1, 1, 2, # R foot relative pos
+ 6, -7, 8, -9, # pelvis orient (quaternion)
+ -10, 11, -12, # pelvis rot Vel
+ 17, -18, 19, -20, # L foot orient
+ 13, -14, 15, -16 # R foot orient
])
- obs_size = state_est_size
- else:
- base_mir_obs = np.array([0.1, 1, 2, -3, 4, -5, -13, -14, 15, 16, 17, 18, 19, -6, -7, 8, 9, 10, 11, 12, 20, -21, 22, -23, 24, -25, -33, -34, 35, 36, 37, 38, 39, -26, -27, 28, 29, 30, 31, 32])
- obs_size = mjstate_size
+ obs_size = min_state_est_size
- # CLOCK_BASED : clock, speed
+ # command --> CLOCK_BASED : clock, speed
if self.clock_based:
append_obs = np.array([len(base_mir_obs) + i for i in range(clock_size+speed_size)])
mirrored_obs = np.concatenate([base_mir_obs, append_obs])
clock_inds = append_obs[0:clock_size].tolist()
obs_size += clock_size + speed_size
- # PHASE_BASED : clock, phase info, speed
+
+ # command --> PHASE_BASED : clock, phase info, speed
elif self.phase_based:
append_obs = np.array([len(base_mir_obs) + i for i in range(clock_size+phase_size+speed_size)])
mirrored_obs = np.concatenate([base_mir_obs, append_obs])
clock_inds = append_obs[0:clock_size].tolist()
obs_size += clock_size + phase_size + speed_size
- # REF TRAJ INPUT : use traj indices determined earlier
+
+ # command --> REF_TRAJ_BASED : single-timestep trajectory
else:
# Find the mirrored trajectory
if self.aslip_traj:
@@ -325,13 +328,21 @@ def set_up_state_space(self):
observation_space = np.zeros(obs_size)
mirrored_obs = mirrored_obs.tolist()
- # check_arr = np.arange(obs_size, dtype=np.float64)
- # check_arr[0] = 0.1
- # print("mir obs check: ", np.all(np.sort(np.abs(mirrored_obs)) == check_arr))
- # exit()
-
return observation_space, clock_inds, mirrored_obs
+ def rotate_to_orient(self, vec):
+ quaternion = euler2quat(z=self.orient_add, y=0, x=0)
+ iquaternion = inverse_quaternion(quaternion)
+
+ if len(vec) == 3:
+ return rotate_by_quaternion(vec, iquaternion)
+
+ elif len(vec) == 4:
+ new_orient = quaternion_product(iquaternion, vec)
+ if new_orient[0] < 0:
+ new_orient = -new_orient
+ return new_orient
+
def step_simulation(self, action, learned_gains=None):
if not self.ik_baseline:
@@ -357,7 +368,7 @@ def step_simulation(self, action, learned_gains=None):
# TODO: move setting gains out of the loop?
# maybe write a wrapper for pd_in_t ?
- if self.learn_gains == False:
+ if not self.learn_gains:
self.u.leftLeg.motorPd.pGain[i] = self.P[i]
self.u.rightLeg.motorPd.pGain[i] = self.P[i]
self.u.leftLeg.motorPd.dGain[i] = self.D[i]
@@ -449,9 +460,8 @@ def step_sim_basic(self, action, learned_gains=None):
def step(self, action, return_omniscient_state=False, f_term=0):
- delay_rand = 7
if self.dynamics_randomization:
- simrate = self.simrate + np.random.randint(-delay_rand, delay_rand+1)
+            simrate = np.random.uniform(self.min_simrate, self.max_simrate)
else:
simrate = self.simrate
@@ -544,6 +554,16 @@ def step(self, action, return_omniscient_state=False, f_term=0):
if reward < self.early_term_cutoff:
done = True
+ if np.random.randint(300) == 0: # random changes to orientation
+ self.orient_add += np.random.uniform(-self.max_orient_change, self.max_orient_change)
+
+ if np.random.randint(100) == 0: # random changes to speed
+ self.speed = np.random.uniform(self.min_speed, self.max_speed)
+ self.speed = np.clip(self.speed, self.min_speed, self.max_speed)
+
+ if np.random.randint(300) == 0: # random changes to sidespeed
+ self.side_speed = np.random.uniform(self.min_side_speed, self.max_side_speed)
+
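Because each resampling above is an independent draw per policy step, the heading and side-speed commands change roughly once every 300 steps and the forward-speed command roughly once every 100. A throwaway sanity check of that intuition (illustrative only, not part of the environment):

```python
import numpy as np

rng = np.random.default_rng(0)
hits = rng.integers(300, size=100_000) == 0   # a 1-in-300 event at every step
gaps = np.diff(np.flatnonzero(hits))
print(gaps.mean())                            # averages out to roughly 300 steps
```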
if return_omniscient_state:
return self.get_full_state(), self.get_omniscient_state(), reward, done, {}
else:
@@ -586,16 +606,13 @@ def reset(self):
self.phaselen = self.trajectory.length - 1
else:
self.speed = (random.randint(0, 40)) / 10
+ self.side_speed = np.random.uniform(self.min_side_speed, self.max_side_speed)
# # Make sure that if speed is above 2, freq is at least 1.2
# if self.speed > 1.3:# or np.any(self.speed_schedule > 1.6):
# self.phase_add = 1.3 + 0.7*random.random()
# else:
# self.phase_add = 1 + random.random()
- if self.fixed_speed != None:
- self.traj_idx = (np.abs(self.speeds - self.speed)).argmin()
- self.speed = self.fixed_speed
-
# Set up phase based
if self.phase_based:
if self.phase_input_mode == "library":
@@ -624,9 +641,9 @@ def reset(self):
self.stance_mode = "grounded"
else:
self.stance_mode = "aerial"
- total_duration = (0.9 - 0.2 / 3.0 * self.speed) / 2
- self.swing_duration = (0.375 + ((0.625 - 0.375) / 3) * self.speed) * total_duration
- self.stance_duration = (0.625 - ((0.625 - 0.375) / 3) * self.speed) * total_duration
+ total_duration = (0.9 - 0.25 / 3.0 * abs(self.speed)) / 2
+ self.swing_duration = (0.30 + ((0.70 - 0.30) / 3) * abs(self.speed)) * total_duration
+ self.stance_duration = (0.70 - ((0.70 - 0.30) / 3) * abs(self.speed)) * total_duration
self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
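Plugging a concrete speed into the timing heuristic above makes its effect easier to see; the numbers below are just the formula evaluated at 1 m/s:

```python
speed = 1.0                                            # m/s
total_duration = (0.9 - 0.25 / 3.0 * abs(speed)) / 2   # ~0.408 s per half gait cycle
swing = (0.30 + (0.70 - 0.30) / 3 * abs(speed)) * total_duration    # ~0.177 s
stance = (0.70 - (0.70 - 0.30) / 3 * abs(speed)) * total_duration   # ~0.231 s
# swing + stance == total_duration; higher speeds shift time from stance to swing
```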
self.simsteps = 0
@@ -733,11 +750,6 @@ def reset(self):
self.sim.set_const()
qpos, qvel = self.get_ref_state(self.phase)
- # orientation = random.randint(-10, 10) * np.pi / 25
- # quaternion = euler2quat(z=orientation, y=0, x=0)
- # qpos[3:7] = quaternion
- # self.y_offset = 0#random.uniform(-3.5, 3.5)
- # qpos[1] = self.y_offset
if self.aslip_traj:
qvel = np.zeros(qvel.shape)
@@ -750,9 +762,10 @@ def reset(self):
# Need to reset u? Or better way to reset cassie_state than taking step
self.cassie_state = self.sim.step_pd(self.u)
+ # reset commands
self.orient_add = 0 # random.randint(-10, 10) * np.pi / 25
- self.orient_time = 0 # random.randint(50, 200)
- self.com_vel_offset = 0 # 0.1*np.random.uniform(-0.1, 0.1, 2)
+ self.speed = np.random.uniform(self.min_speed, self.max_speed)
+ self.side_speed = np.random.uniform(self.min_side_speed, self.max_side_speed)
# reset mujoco tracking variables
self.l_foot_frc = 0
@@ -770,8 +783,6 @@ def reset_for_test(self, full_reset=False):
self.time = 0
self.counter = 0
self.orient_add = 0
- self.orient_time = np.inf
- self.y_offset = 0
self.phase_add = 1
self.state_history = [np.zeros(self._obs) for _ in range(self.history+1)]
@@ -846,7 +857,8 @@ def reset_cassie_state(self):
# Helper function for updating the speed, used in visualization tests
# not needed in training cause we don't change speeds in middle of rollout, and
# we randomize the starting phase of each rollout
- def update_speed(self, new_speed):
+ def update_speed(self, new_speed, new_side_speed=0.0):
+
if self.aslip_traj:
self.traj_idx = (np.abs(self.speeds - new_speed)).argmin()
self.speed = self.traj_idx / 10
@@ -859,8 +871,10 @@ def update_speed(self, new_speed):
ref_pos, ref_vel = self.get_ref_state(self.phase)
self.offset = ref_pos[self.pos_idx]
else:
- self.speed = new_speed
+ self.speed = np.clip(new_speed, self.min_speed, self.max_speed)
+ self.side_speed = np.clip(new_side_speed, self.min_side_speed, self.max_side_speed)
+
if self.phase_based:
self.swing_duration = max(0.01, self.swing_duration)
self.stance_duration = max(0.01, self.stance_duration)
@@ -870,18 +884,15 @@ def update_speed(self, new_speed):
elif self.reward_func == "aslip_clock":
pass
else:
- total_duration = (0.9 - 0.2 / 3.0 * self.speed) / 2
- self.swing_duration = (0.375 + ((0.625 - 0.375) / 3) * self.speed) * total_duration
- self.stance_duration = (0.625 - ((0.625 - 0.375) / 3) * self.speed) * total_duration
+            total_duration = (0.9 - 0.25 / 3.0 * abs(self.speed)) / 2
+            self.swing_duration = (0.30 + ((0.70 - 0.30) / 3) * abs(self.speed)) * total_duration
+            self.stance_duration = (0.70 - ((0.70 - 0.30) / 3) * abs(self.speed)) * total_duration
old_phaselen = self.phaselen
self.left_clock, self.right_clock, self.phaselen = create_phase_reward(self.swing_duration, self.stance_duration, self.strict_relaxer, self.stance_mode, self.have_incentive, FREQ=2000//self.simrate)
self.phase = int(self.phaselen * self.phase / old_phaselen)
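The last line rescales the phase counter so the robot stays at the same fraction of the gait cycle when `phaselen` changes. With made-up numbers:

```python
old_phaselen, phase = 32, 24                 # 75% of the way through the old cycle
new_phaselen = 28
new_phase = int(new_phaselen * phase / old_phaselen)
print(new_phase, new_phase / new_phaselen)   # 21, still 0.75 of the new cycle
```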
def compute_reward(self, action):
- qpos = np.copy(self.sim.qpos())
- qvel = np.copy(self.sim.qvel())
- ref_pos, ref_vel = self.get_ref_state(self.phase)
if self.reward_func == "clock" or self.reward_func == "load_clock" or self.reward_func == "switch_clock":
self.early_term_cutoff = -99.
if self.early_reward:
@@ -979,19 +990,17 @@ def get_full_state(self):
# trajectory despite being global coord. Y is only invariant to straight
# line trajectories.
- # CLOCK BASED (NO TRAJECTORY)
+ # command --> PHASE_BASED : clock, phase info, speed
if self.phase_based:
clock = [np.sin(2 * np.pi * self.phase / self.phaselen),
np.cos(2 * np.pi * self.phase / self.phaselen)]
-
- ext_state = np.concatenate((clock, [self.swing_duration, self.stance_duration], encode_stance_mode(self.stance_mode), [self.speed]))
-
+ ext_state = np.concatenate((clock, [self.swing_duration, self.stance_duration], encode_stance_mode(self.stance_mode), [self.speed, self.side_speed]))
+ # command --> CLOCK_BASED : clock, speed
elif self.clock_based:
clock = [np.sin(2 * np.pi * self.phase / self.phaselen),
np.cos(2 * np.pi * self.phase / self.phaselen)]
- ext_state = np.concatenate((clock, [self.speed]))
-
- # ASLIP TRAJECTORY
+ ext_state = np.concatenate((clock, [self.speed, self.side_speed]))
+ # command --> REF_TRAJ_BASED : aslip trajectory
elif self.aslip_traj and not self.clock_based:
if(self.phase == 0):
# ext_state = np.concatenate(get_ref_aslip_ext_state(self, self.cassie_state, self.last_pelvis_pos, self.phaselen - 1, offset=self.vertOffset))
@@ -999,45 +1008,46 @@ def get_full_state(self):
else:
# ext_state = np.concatenate(get_ref_aslip_ext_state(self, self.cassie_state, self.last_pelvis_pos, self.phaselen - 1, offset=self.vertOffset))
ext_state = np.concatenate(get_ref_aslip_unaltered_state(self, self.phase))
-
- # OTHER TRAJECTORY
+ # command --> REF_TRAJ_BASED : agility trajectory
else:
ext_state = np.concatenate([ref_pos[self.pos_index], ref_vel[self.vel_index]])
# Update orientation
- new_orient = self.cassie_state.pelvis.orientation[:]
- new_translationalVelocity = self.cassie_state.pelvis.translationalVelocity[:]
- new_translationalAcceleleration = self.cassie_state.pelvis.translationalAcceleration[:]
- # new_translationalVelocity[0:2] += self.com_vel_offset
- quaternion = euler2quat(z=self.orient_add, y=0, x=0)
- iquaternion = inverse_quaternion(quaternion)
- new_orient = quaternion_product(iquaternion, self.cassie_state.pelvis.orientation[:])
- if new_orient[0] < 0:
- new_orient = -new_orient
- new_translationalVelocity = rotate_by_quaternion(self.cassie_state.pelvis.translationalVelocity[:], iquaternion)
- new_translationalAcceleleration = rotate_by_quaternion(self.cassie_state.pelvis.translationalAcceleration[:], iquaternion)
- motor_pos = self.cassie_state.motor.position[:]
- joint_pos = self.cassie_state.joint.position[:]
- # if self.joint_rand:
- # motor_pos += self.joint_offsets[0:10]
- # joint_pos += self.joint_offsets[10:16]
-
- # Use state estimator
- robot_state = np.concatenate([
- self.cassie_state.leftFoot.position[:], # left foot position
- self.cassie_state.rightFoot.position[:], # right foot position
- new_orient, # pelvis orientation
-
- self.cassie_state.pelvis.rotationalVelocity[:], # pelvis rotational velocity
-
- self.cassie_state.leftFoot.orientation[:], # unactuated joint positions
- self.cassie_state.rightFoot.orientation[:] # unactuated joint velocities
- ])
-
- if self.state_est:
- state = np.concatenate([robot_state, ext_state])
+ new_orient = self.rotate_to_orient(self.cassie_state.pelvis.orientation[:])
+ new_translationalVelocity = self.rotate_to_orient(self.cassie_state.pelvis.translationalVelocity[:])
+        new_translationalAcceleration = self.rotate_to_orient(self.cassie_state.pelvis.translationalAcceleration[:])
+
+ # motor and joint poses
+ if self.joint_rand:
+ motor_pos = self.cassie_state.motor.position[:] + self.motor_encoder_noise
+ joint_pos = self.cassie_state.joint.position[:] + self.joint_encoder_noise
else:
- state = np.concatenate([qpos[self.pos_index], qvel[self.vel_index], ext_state])
+ motor_pos = self.cassie_state.motor.position[:]
+ joint_pos = self.cassie_state.joint.position[:]
+
+ if self.min_input:
+ robot_state = np.concatenate([
+ self.cassie_state.leftFoot.position[:], # left foot position
+ self.cassie_state.rightFoot.position[:], # right foot position
+ new_orient, # pelvis orientation
+ self.cassie_state.pelvis.rotationalVelocity[:], # pelvis rotational velocity
+ self.cassie_state.leftFoot.orientation[:], # left foot orientation
+ self.cassie_state.rightFoot.orientation[:] # right foot orientation
+ ])
+ else:
+ robot_state = np.concatenate([
+ [self.cassie_state.pelvis.position[2] - self.cassie_state.terrain.height], # pelvis height
+ new_orient, # pelvis orientation
+ motor_pos, # actuated joint positions
+ new_translationalVelocity, # pelvis translational velocity
+ self.cassie_state.pelvis.rotationalVelocity[:], # pelvis rotational velocity
+ self.cassie_state.motor.velocity[:], # actuated joint velocities
+                new_translationalAcceleration,               # pelvis translational acceleration
+ joint_pos, # unactuated joint positions
+ self.cassie_state.joint.velocity[:] # unactuated joint velocities
+ ])
+
+ state = np.concatenate([robot_state, ext_state])
self.state_history.insert(0, state)
self.state_history = self.state_history[:self.history+1]
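For the clock- and phase-based input modes above, the periodic part of the command input is just the sin/cos pair computed from `phase`. A quick illustration of those two features over one cycle, with a made-up `phaselen`:

```python
import numpy as np

phaselen = 28                                  # illustrative value only
for phase in range(0, phaselen + 1, 7):
    clock = [np.sin(2 * np.pi * phase / phaselen),
             np.cos(2 * np.pi * phase / phaselen)]
    print(phase, np.round(clock, 3))
# the pair traces the unit circle once per gait cycle, so the policy can tell
# every point of the cycle apart without a discontinuity at wrap-around
```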
diff --git a/deprecated/RL_controller_aslip.py b/deprecated/RL_controller_aslip.py
deleted file mode 100644
index 33299b16..00000000
--- a/deprecated/RL_controller_aslip.py
+++ /dev/null
@@ -1,358 +0,0 @@
-
-from cassie.cassiemujoco.cassieUDP import *
-from cassie.cassiemujoco.cassiemujoco_ctypes import *
-# from cassie.speed_env import CassieEnv_speed
-# from cassie.speed_double_freq_env import CassieEnv_speed_dfreq
-# from cassie.speed_no_delta_env import CassieEnv_speed_no_delta
-# from cassie.speed_no_delta_neutral_foot_env import CassieEnv_speed_no_delta_neutral_foot
-
-import time
-import numpy as np
-import torch
-import pickle
-import platform
-from cassie.quaternion_function import *
-
-#import signal
-import atexit
-import sys
-import select
-import tty
-import termios
-
-"""
-We need to include the trajectory library for the right offset information, as well as the right phaselen and speed
-"""
-
-def getAllTrajectories(speeds):
- trajectories = []
-
- for i, speed in enumerate(speeds):
- dirname = os.path.dirname(__file__)
- traj_path = os.path.join(dirname, "cassie", "trajectory", "aslipTrajsSweep", "walkCycle_{}.pkl".format(speed))
- trajectories.append(CassieIKTrajectory(traj_path))
-
- # print("Got all trajectories")
- return trajectories
-
-class CassieIKTrajectory:
- def __init__(self, filepath):
- with open(filepath, "rb") as f:
- trajectory = pickle.load(f)
-
- self.qpos = np.copy(trajectory["qpos"])
- self.length = self.qpos.shape[0]
- self.qvel = np.copy(trajectory["qvel"])
- self.rfoot = np.copy(trajectory["rfoot"])
- self.lfoot = np.copy(trajectory["lfoot"])
-
-# simrate used to be 60
-class TrajectoryInfo:
- def __init__(self):
-
- self.freq_adjust = 1
-
- self.speeds = [x / 10 for x in range(0, 21)]
- self.trajectories = getAllTrajectories(self.speeds)
- self.num_speeds = len(self.trajectories)
-
- self.time = 0 # number of time steps in current episode
- self.phase = 0 # portion of the phase the robot is in
- self.counter = 0 # number of phase cycles completed in episode
-
- # NOTE: each trajectory in trajectories should have the same length
- self.speed = self.speeds[0]
- self.trajectory = self.trajectories[0]
-
- # NOTE: a reference trajectory represents ONE phase cycle
-
- # should be floor(len(traj) / simrate) - 1
- # should be VERY cautious here because wrapping around trajectory
- # badly can cause assymetrical/bad gaits
- # self.phaselen = floor(self.trajectory.length / self.simrate) - 1
- self.phaselen = self.trajectory.length - 1
-
- # see include/cassiemujoco.h for meaning of these indices
- self.pos_idx = [7, 8, 9, 14, 20, 21, 22, 23, 28, 34]
- self.vel_idx = [6, 7, 8, 12, 18, 19, 20, 21, 25, 31]
-
- # maybe make ref traj only send relevant idxs?
- ref_pos, ref_vel = self.get_ref_state(self.phase)
- self.offset = ref_pos[self.pos_idx]
- self.phase_add = 1
-
- # get the corresponding state from the reference trajectory for the current phase
- def get_ref_state(self, phase=None):
- if phase is None:
- phase = self.phase
-
- if phase > self.phaselen:
- phase = 0
-
- # pos = np.copy(self.trajectory.qpos[phase * self.simrate])
- pos = np.copy(self.trajectory.qpos[phase])
-
- # this is just setting the x to where it "should" be given the number
- # of cycles
- #pos[0] += (self.trajectory.qpos[-1, 0] - self.trajectory.qpos[0, 0]) * self.counter
- pos[0] += (self.trajectory.qpos[-1, 0] - self.trajectory.qpos[0, 0]) * self.counter
-
- # ^ should only matter for COM error calculation,
- # gets dropped out of state variable for input reasons
-
- # setting lateral distance target to 0?
- # regardless of reference trajectory?
- pos[1] = 0
-
- # vel = np.copy(self.trajectory.qvel[phase * self.simrate])
- vel = np.copy(self.trajectory.qvel[phase])
-
- return pos, vel
-
- def update_info(self, new_speed):
-
- self.speed = new_speed
-
- # find closest speed in [0.0, 0.1, ... 3.0]. use this to find new trajectory
- self.trajectory = self.trajectories[ (np.abs([speed_i - self.speed for speed_i in self.speeds])).argmin() ]
-
- # new offset
- ref_pos, ref_vel = self.get_ref_state(self.phase)
- self.offset = ref_pos[self.pos_idx]
-
- # phaselen
- self.phaselen = self.trajectory.length - 1
-
- return self.phaselen, self.offset
-
-time_log = [] # time stamp
-input_log = [] # network inputs
-output_log = [] # network outputs
-state_log = [] # cassie state
-target_log = [] # PD target log
-traj_log = [] # reference trajectory log
-
-policy_name = "aslip_unified_0_v2"
-filename = "logdata"
-directory = "./hardware_logs/" + policy_name + "/"
-if not os.path.exists(directory):
- os.makedirs(directory)
-filep = open("./hardware_logs/" + policy_name + "/" + filename + ".pkl", "wb")
-
-def log():
- data = {"time": time_log, "output": output_log, "input": input_log, "state": state_log, "target": target_log, "trajectory": traj_log}
- pickle.dump(data, filep)
-
-def isData():
- return select.select([sys.stdin], [], [], 0) == ([sys.stdin], [], [])
-
-atexit.register(log)
-
-# Prevent latency issues by disabling multithreading in pytorch
-torch.set_num_threads(1)
-
-# Prepare model
-# env = CassieEnv_speed_no_delta_neutral_foot("walking", clock_based=True, state_est=True)
-# env.reset_for_test()
-traj = TrajectoryInfo()
-
-# policy = torch.load("./trained_models/old_aslip/final_v1/aslip_unified_freq_correction.pt")
-policy = torch.load("./trained_models/" + policy_name + ".pt")
-policy.eval()
-
-max_speed = 2.0
-min_speed = 0.0
-max_y_speed = 0.5
-min_y_speed = -0.5
-
-# Initialize control structure with gains
-P = np.array([100, 100, 88, 96, 50, 100, 100, 88, 96, 50])
-D = np.array([10.0, 10.0, 8.0, 9.6, 5.0, 10.0, 10.0, 8.0, 9.6, 5.0])
-u = pd_in_t()
-for i in range(5):
- u.leftLeg.motorPd.pGain[i] = P[i]
- u.leftLeg.motorPd.dGain[i] = D[i]
- u.rightLeg.motorPd.pGain[i] = P[i+5]
- u.rightLeg.motorPd.dGain[i] = D[i+5]
-
-pos_index = np.array([2,3,4,5,6,7,8,9,14,15,16,20,21,22,23,28,29,30,34])
-vel_index = np.array([0,1,2,3,4,5,6,7,8,12,13,14,18,19,20,21,25,26,27,31])
-pos_mirror_index = np.array([2,3,4,5,6,21,22,23,28,29,30,34,7,8,9,14,15,16,20])
-vel_mirror_index = np.array([0,1,2,3,4,5,19,20,21,25,26,27,31,6,7,8,12,13,14,18])
-offset = traj.offset
-# offset = np.array([0.0045, 0.0, 0.4973, -1.1997, -1.5968, 0.0045, 0.0, 0.4973, -1.1997, -1.5968])
-
-# Determine whether running in simulation or on the robot
-if platform.node() == 'cassie':
- cassie = CassieUdp(remote_addr='10.10.10.3', remote_port='25010',
- local_addr='10.10.10.100', local_port='25011')
-else:
- cassie = CassieUdp() # local testing
-
-
-# Connect to the simulator or robot
-print('Connecting...')
-y = None
-while y is None:
- cassie.send_pd(pd_in_t())
- time.sleep(0.001)
- y = cassie.recv_newest_pd()
-received_data = True
-print('Connected!\n')
-
-# Record time
-t = time.monotonic()
-t0 = t
-
-orient_add = 0
-old_settings = termios.tcgetattr(sys.stdin)
-try:
- tty.setcbreak(sys.stdin.fileno())
- while True:
- # Wait until next cycle time
- while time.monotonic() - t < 60/2000:
- time.sleep(0.001)
- t = time.monotonic()
- tt = time.monotonic() - t0
-
- # Get newest state
- state = cassie.recv_newest_pd()
-
- if state is None:
- print('Missed a cycle')
- continue
-
- if platform.node() == 'cassie':
- # Radio control
- orient_add -= state.radio.channel[3] / 60.0
- print("orient add: ", orient_add)
- traj.speed = max(min_speed, state.radio.channel[0] * max_speed)
- traj.speed = min(max_speed, state.radio.channel[0] * max_speed)
- # traj.phase_add = state.radio.channel[5] + 1
- # env.y_speed = max(min_y_speed, -state.radio.channel[1] * max_y_speed)
- # env.y_speed = min(max_y_speed, -state.radio.channel[1] * max_y_speed)
- else:
- # Automatically change orientation and speed
- tt = time.monotonic() - t0
- orient_add += 0#math.sin(t / 8) / 400
- if isData():
- c = sys.stdin.read(1)
- if c == 'w':
- traj.speed += .1
- print("Increasing speed to: ", traj.speed)
- elif c == 's':
- traj.speed -= .1
- print("Decreasing speed to: ", traj.speed)
- # elif c == 'a':
- # y_speed += .1
- # print("Increasing y speed to: ", y_speed)
- # elif c == 'd':
- # y_speed -= .1
- # print("Decreasing y speed to: ", y_speed)
- elif c == 'j':
- traj.freq_adjust += .1
- print("Increasing frequency to: ", traj.freq_adjust)
- elif c == 'h':
- traj.freq_adjust -= .1
- print("Decreasing frequency to: ", traj.freq_adjust)
- elif c == 'l':
- orient_add += .1
- print("Increasing orient_add to: ", orient_add)
- elif c == 'k':
- orient_add -= .1
- print("Decreasing orient_add to: ", orient_add)
- elif c == 'p':
- print("Applying force")
- push = 100
- push_dir = 0
- force_arr = np.zeros(6)
- force_arr[push_dir] = push
- env.sim.apply_force(force_arr)
- else:
- pass
- traj.speed = max(min_speed, traj.speed)
- traj.speed = min(max_speed, traj.speed)
- # y_speed = max(min_y_speed, y_speed)
- # y_speed = min(max_y_speed, y_speed)
- # print("y_speed: ", y_speed)
- # print("frequency: ", traj.phase_add)
-
- traj.update_info(traj.speed)
-
- clock = [np.sin(2 * np.pi * traj.phase * traj.freq_adjust / traj.phaselen), np.cos(2 * np.pi * traj.phase * traj.freq_adjust / traj.phaselen)]
-
- # euler_orient = quaternion2euler(state.pelvis.orientation[:])
- # print("euler orient: ", euler_orient + np.array([orient_add, 0, 0]))
- # new_orient = euler2quat(euler_orient + np.array([orient_add, 0, 0]))
- quaternion = euler2quat(z=orient_add, y=0, x=0)
- iquaternion = inverse_quaternion(quaternion)
- new_orient = quaternion_product(iquaternion, state.pelvis.orientation[:])
- if new_orient[0] < 0:
- new_orient = -new_orient
- new_translationalVelocity = rotate_by_quaternion(state.pelvis.translationalVelocity[:], iquaternion)
- print("orig orientation: ", state.pelvis.orientation[:])
- print('new_orientation: {}'.format(new_orient))
-
- # ext_state = np.concatenate((clock, [speed, y_speed]))
- ext_state = np.concatenate((clock, [traj.speed] ))
- robot_state = np.concatenate([
- [state.pelvis.position[2] - state.terrain.height], # pelvis height
- new_orient,
- # state.pelvis.orientation[:], # pelvis orientation
- state.motor.position[:], # actuated joint positions
-
- # state.pelvis.translationalVelocity[:], # pelvis translational velocity
- new_translationalVelocity[:],
- state.pelvis.rotationalVelocity[:], # pelvis rotational velocity
- state.motor.velocity[:], # actuated joint velocities
-
- state.pelvis.translationalAcceleration[:], # pelvis translational acceleration
-
- state.joint.position[:], # unactuated joint positions
- state.joint.velocity[:] # unactuated joint velocities
- ])
- RL_state = np.concatenate([robot_state, ext_state])
-
- actual_speed = state.pelvis.translationalVelocity[0]
- print("target speed: {:.2f}\tactual speed: {:.2f}\tfreq: {}".format(traj.speed, actual_speed, traj.freq_adjust))
-
- #pretending the height is always 1.0
- # RL_state[0] = 1.0
-
- # Construct input vector
- torch_state = torch.Tensor(RL_state)
- # torch_state = shared_obs_stats.normalize(torch_state)
-
- # Get action
- action = policy.act(torch_state, True)
- env_action = action.data.numpy()
- target = env_action + traj.offset
-
- # print(state.pelvis.position[2] - state.terrain.height)
-
- # Send action
- for i in range(5):
- u.leftLeg.motorPd.pTarget[i] = target[i]
- u.rightLeg.motorPd.pTarget[i] = target[i+5]
- #time.sleep(0.005)
- cassie.send_pd(u)
-
- # Measure delay
- print('delay: {:6.1f} ms'.format((time.monotonic() - t) * 1000))
-
- # Logging
- time_log.append(time.time())
- state_log.append(state)
- input_log.append(RL_state)
- output_log.append(env_action)
- target_log.append(target)
- traj_log.append(traj.offset)
-
- # Track phase
- traj.phase += traj.phase_add
- if traj.phase >= traj.phaselen:
- traj.phase = 0
- traj.counter += 1
-
-finally:
- termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
\ No newline at end of file
diff --git a/deprecated/figures/16itrensembleplot.png b/deprecated/figures/16itrensembleplot.png
deleted file mode 100644
index bd0d0113..00000000
Binary files a/deprecated/figures/16itrensembleplot.png and /dev/null differ
diff --git a/deprecated/figures/16itrpolicyplot.png b/deprecated/figures/16itrpolicyplot.png
deleted file mode 100644
index 45f7fdcb..00000000
Binary files a/deprecated/figures/16itrpolicyplot.png and /dev/null differ
diff --git a/deprecated/figures/stable_reward=23.7.png b/deprecated/figures/stable_reward=23.7.png
deleted file mode 100644
index f233617a..00000000
Binary files a/deprecated/figures/stable_reward=23.7.png and /dev/null differ
diff --git a/deprecated/figures/weird_reference_trajectory.png b/deprecated/figures/weird_reference_trajectory.png
deleted file mode 100644
index 11533bf9..00000000
Binary files a/deprecated/figures/weird_reference_trajectory.png and /dev/null differ
diff --git a/deprecated/logging.py b/deprecated/logging.py
deleted file mode 100644
index f184d10f..00000000
--- a/deprecated/logging.py
+++ /dev/null
@@ -1,210 +0,0 @@
-"""
-A simple live-updating logger for logging training progress.
-
-Based largely off Berkely's DRL course HW4, which is itself inspired by rllab.
-https://github.com/berkeleydeeprlcourse/homework/blob/master/hw4/logz.py
-"""
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-plt.style.use('ggplot')
-
-from functools import partial
-import os.path as osp, shutil, time, atexit, os, subprocess, hashlib, sys
-import configparser
-from collections import OrderedDict
-import numpy as np
-
-import ray
-
-matplotlib.rcParams.update({'font.size': 8})
-
-#from scipy.signal import medfilt
-
-class Logger():
- def __init__(self, args, env_name, viz=True, viz_list=[]):
- self.ansi = dict(
- gray=30,
- red=31,
- green=32,
- yellow=33,
- blue=34,
- magenta=35,
- cyan=36,
- white=37,
- crimson=38
- )
-
- self.name = args.name
-
- self.viz_list = ["all"]
-
- self.args = args
-
- if viz:
- from visdom import Visdom
- self.viz = Visdom(port=args.viz_port)
- self.env = env_name
- self.plots = {}
- else:
- self.viz = None
-
- self.output_dir = self._get_directory(args)
- self.init = True
- self.header = []
- self.current_row = {}
-
- def _get_directory(self, args):
- """Use hyperparms to set a directory to output diagnostic files."""
- #get hyperparameters as dictionary
- arg_dict = args.__dict__
-
- assert "seed" in arg_dict, \
- "You must provide a 'seed' key in your command line arguments"
-
- assert "logdir" in arg_dict, \
- "You must provide a 'logdir' key in your command line arguments."
-
- #sort the keys so the same hyperparameters will always have the same hash
- arg_dict = OrderedDict(sorted(arg_dict.items(), key=lambda t: t[0]))
-
- #remove seed so it doesn't get hashed, store value for filename
- # same for logging directory
- seed = str(arg_dict.pop("seed"))
- logdir = str(arg_dict.pop('logdir'))
-
- # get a unique hash for the hyperparameter settings, truncated at 10 chars
- arg_hash = hashlib.md5(str(arg_dict).encode('ascii')).hexdigest()[0:10]
-
- output_dir = osp.join(logdir, arg_hash)
-
- # create a directory with the hyperparm hash as its name, if it doesn't
- # already exist.
- os.makedirs(output_dir, exist_ok=True)
-
- # create a file for this seed, this is where output will be logged
- filename = "seed" + seed + ".log"
-
- # currently logged-to directories will be pre-pended with "ACTIVE_"
- active_path = osp.join(output_dir, filename)
-
- # Create a file with all the hyperparam settings in plaintext
- info_path = osp.join(output_dir, "experiment.info")
- self._generate_info_file(open(info_path, 'w'), arg_dict)
-
- print(self._colorize("Logging data to %s" % active_path,
- 'green', bold=True))
-
- return active_path
-
- def record(self, key, val, x_val, title_name, x_var_name="Timesteps", split_name="train"):
- """
- Log some diagnostic value in current iteration.
-
- Call this exactly once for each diagnostic, every iteration
- """
- if self.init:
- self.header.append(key)
- # if self.viz is not None:
- # self.wins.append(None)
- else:
- assert key in self.header, \
- "Key %s not in header. All keys must be set in first iteration" % key
-
- assert key not in self.current_row, \
- "You already set key %s this iteration. Did you forget to call dump()?" % key
-
- self.current_row[key] = val
-
- if self.viz is not None:
- self.plot(key, x_var_name, split_name, title_name, x_val, val)
-
- def dump(self):
- """Write all of the diagnostics from the current iteration"""
- vals = []
-
- sys.stdout.write("-" * 37 + "\n")
- for key in self.header:
- val = self.current_row.get(key, "")
- if hasattr(val, "__float__"):
- valstr = "%8.3g" % val
- else:
- valstr = val
-
- sys.stdout.write("| %15s | %15s |" % (key, valstr) + "\n")
- vals.append(float(val))
- sys.stdout.write("-" * 37 + "\n")
- sys.stdout.flush()
-
- output_file = None
- if self.init:
- output_file = open(self.output_dir, "w")
-
- output_file.write("\t".join(self.header))
- output_file.write("\n")
- else:
- output_file = open(self.output_dir, "a")
-
- output_file.write("\t".join(map(str, vals)))
- output_file.write("\n")
- output_file.flush()
- output_file.close()
-
- self.current_row.clear()
-
- self.init = False
-
- # if self.viz is not None:
- # self.plot()
-
- def config_monitor(self, config_path=None):
- if config_path is None:
- config_path = os.path.join(os.path.dirname(__file__), "../config/monitor.ini")
-
- config = configparser.ConfigParser()
- config.read(config_path)
-
- return config["monitor"]
-
- def plot(self, var_name, x_var_name, split_name, title_name, x, y):
- if var_name not in self.plots:
- self.plots[var_name] = self.viz.line(X=np.array([x,x]), Y=np.array([y,y]), env=self.env, opts=dict(
- legend=[split_name],
- title=title_name,
- xlabel=x_var_name,
- ylabel=var_name
- ))
- else:
- self.viz.line(X=np.array([x]), Y=np.array([y]), env=self.env, win=self.plots[var_name], name=split_name, update = 'append')
-
- def _load_data(self):
- log_file = open(self.output_dir, 'r')
-
- header = log_file.readline().rstrip('\n').split('\t')
-
- data = []
- for line in log_file:
- vals = line.rstrip('\n').split('\t')
- vals = [float(val) for val in vals]
- data.append(vals)
-
- data = np.array(data)
-
- log_file.close()
- return data, header
-
- def _generate_info_file(self, file, arg_dict):
- for key, val in arg_dict.items():
- file.write("%s: %s" % (key, val))
- file.write('\n')
-
- def _colorize(self, string, color, bold=False, highlight=False):
- """Format string to print with color 'color' if printed to unix terminal."""
- attr = []
- num = self.ansi[color]
- if highlight:
- num += 10
- attr.append(str(num))
- if bold:
- attr.append('1')
- return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
\ No newline at end of file
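The deleted logger's `_get_directory` is mostly bookkeeping: it hashes the sorted hyperparameters (with `seed` and `logdir` removed) to name the run directory, then writes one log file per seed. A condensed sketch of that scheme, not the exact helper being removed here:

```python
import hashlib
import os.path as osp
from collections import OrderedDict

def run_log_path(arg_dict, logdir="logs/"):
    args = OrderedDict(sorted(arg_dict.items()))   # stable ordering before hashing
    seed = str(args.pop("seed"))
    args.pop("logdir", None)
    arg_hash = hashlib.md5(str(args).encode("ascii")).hexdigest()[:10]
    return osp.join(logdir, arg_hash, "seed" + seed + ".log")

print(run_log_path({"env_name": "Cassie-v0", "lr": 1e-4, "seed": 1337, "logdir": "logs/"}))
```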
diff --git a/deprecated/old/dagger.py b/deprecated/old/dagger.py
deleted file mode 100644
index fd4639e3..00000000
--- a/deprecated/old/dagger.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from rl.envs import SlipEnv
-from rl.policies import GaussianMLP
-from rl.algos import DAgger
-from rl.utils import policyplot
-
-import torch
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--dagger_itr", type=int, default=100,
- help="number of iterations of DAgger")
-parser.add_argument("--epochs", type=int, default=10,
- help="number of optimization epochs")
-parser.add_argument("--trj_len", type=int, default=10000,
- help="maximum trajectory length")
-parser.add_argument("--seed", type=int, default=1,
- help="random seed for experiment")
-args = parser.parse_args()
-
-if __name__ == "__main__":
- torch.manual_seed(args.seed)
-
- env = SlipEnv(0.001)
- obs_dim = env.observation_space.shape[0]
- action_dim = env.action_space.shape[0]
-
- learner = GaussianMLP(obs_dim, action_dim, (64,))
- algo = DAgger(env, learner, None)
-
- algo.train(
- dagger_itr=args.dagger_itr,
- epochs=args.epochs,
- trj_len=args.trj_len
- )
-
- policyplot(env, learner, args.trj_len)
diff --git a/deprecated/old/experiment.sh b/deprecated/old/experiment.sh
deleted file mode 100755
index 115f0161..00000000
--- a/deprecated/old/experiment.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-for i in {1..15}
-do
- python examples/ppo.py --name "modelv$i" --seed $RANDOM &
-done
\ No newline at end of file
diff --git a/deprecated/old/parallel_profiler.py b/deprecated/old/parallel_profiler.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/deprecated/old/vpg.py b/deprecated/old/vpg.py
deleted file mode 100644
index 1fb61122..00000000
--- a/deprecated/old/vpg.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""Python file for automatically running experiments from command line."""
-import argparse
-
-from rllab.envs.box2d.cartpole_env import CartpoleEnv
-from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
-from rllab.envs.mujoco.walker2d_env import Walker2DEnv
-from rllab.envs.mujoco.hopper_env import HopperEnv
-from rllab.envs.mujoco.walker2d_env import Walker2DEnv
-from rllab.envs.gym_env import GymEnv
-from rllab.envs.normalized_env import normalize
-
-from rl.utils import run_experiment
-from rl.policies import GaussianA2C
-from rl.baselines import FeatureEncodingBaseline
-from rl.algos import VPG, CPI
-
-import gym
-import torch
-
-
-parser = argparse.ArgumentParser()
-
-CPI.add_arguments(parser)
-
-parser.add_argument("--seed", type=int, default=1,
- help="RNG seed")
-parser.add_argument("--logdir", type=str, default="/tmp/rl/experiments/",
- help="Where to log diagnostics to")
-
-args = parser.parse_args()
-
-if __name__ == "__main__":
- #env = normalize(Walker2DEnv())
-
- env = gym.make("Walker2d-v1")
-
- #env.seed(args.seed)
- #torch.manual_seed(args.seed)
-
- obs_dim = env.observation_space.shape[0]
- action_dim = env.action_space.shape[0]
-
- policy = GaussianA2C(obs_dim, action_dim, (32,))
-
- algo = CPI(
- env=env,
- policy=policy,
- lr=args.lr,
- tau=args.tau
- )
-
- run_experiment(
- algo=algo,
- args=args,
- log=True,
- monitor=False,
- render=True
- )
diff --git a/deprecated/plotting.py b/deprecated/plotting.py
deleted file mode 100644
index dfeb0e0f..00000000
--- a/deprecated/plotting.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""This screws up visualize.py"""
-"""
-import numpy as np
-from matplotlib import pyplot as plt
-from matplotlib.lines import Line2D
-
-from torch.autograd import Variable as Var
-from torch import Tensor
-
-
-class RealtimePlot():
- def __init__(self, style='ggplot'):
- plt.style.use(style)
- plt.ion()
- self.fig, self.ax = plt.subplots()
- self.xlim = 0
- self.yvals = []
- self.line = Line2D([], [])
- self.ax.add_line(self.line)
-
- def config(self, ylabel, xlabel):
- self.ax.set_ylabel(ylabel)
- self.ax.set_xlabel(xlabel)
- self.fig.tight_layout()
-
- def plot(self, y):
- self.yvals.append(y)
- self.line.set_data(np.arange(len(self.yvals)), self.yvals)
-
- self.ax.relim()
- self.ax.autoscale_view()
- self.ax.set_xlim(0, self.xlim)
- self.xlim += 1
-
- self.fig.canvas.flush_events()
-
- def done(self):
- plt.ioff()
- plt.show()
-
-
-def policyplot(env, policy, trj_len):
- obs_dim = env.observation_space.shape[0]
- action_dim = env.action_space.shape[0]
-
- y = np.zeros((trj_len, action_dim))
- X = np.zeros((trj_len, obs_dim))
-
- obs = env.reset()
- for t in range(trj_len):
- X[t, :] = obs
- action = policy(Var(Tensor(obs[None, :]))).data.numpy()[0]
- y[t, :] = action
- obs = env.step(action)[0]
-
- fig, axes = plt.subplots(1, action_dim)
-
- for a in range(action_dim):
- axes[a].plot(np.arange(trj_len), y[:, a])
-
- plt.show()
-"""
\ No newline at end of file
diff --git a/deprecated/renderpol.py b/deprecated/renderpol.py
deleted file mode 100644
index 69f092ff..00000000
--- a/deprecated/renderpol.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import sys
-sys.path.append("..") # Adds higher directory to python modules path.
-
-from rl.utils import renderpolicy, rendermultipolicy, renderpolicy_speedinput, rendermultipolicy_speedinput
-from cassie import CassieEnv
-
-import torch
-
-import numpy as np
-import os
-import time
-
-import argparse
-import pickle
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--path", type=str, default="./trained_models/ppo/Cassie-v0/7b7e24-seed0/", help="path to folder containing policy and run details")
-args = parser.parse_args()
-run_args = pickle.load(open(args.path + "experiment.pkl", "rb"))
-
-cassie_env = CassieEnv(traj=run_args.traj, clock_based=run_args.clock_based, state_est=run_args.state_est, dynamics_randomization=run_args.dyn_random, no_delta=run_args.no_delta)
-policy = torch.load(args.path + "actor.pt")
-policy.eval()
-
-# cassie_env = CassieEnv(traj="aslip", clock_based=False, state_est=True, dynamics_randomization=False, no_delta=False)
-# policy = torch.load(args.path + "aslip_unified_task10_v7.pt")
-# policy.eval()
-
-renderpolicy_speedinput(cassie_env, policy, deterministic=False, dt=0.05, speedup = 2)
\ No newline at end of file
diff --git a/deprecated/results/TD3_Cassie-mimic-v0_0.npy b/deprecated/results/TD3_Cassie-mimic-v0_0.npy
deleted file mode 100644
index 2efc4409..00000000
Binary files a/deprecated/results/TD3_Cassie-mimic-v0_0.npy and /dev/null differ
diff --git a/deprecated/setup.py b/deprecated/setup.py
deleted file mode 100644
index f1d377a2..00000000
--- a/deprecated/setup.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from setuptools import setup, find_packages
-
-#REQUIRED_PACKAGES = ["torch==0.4", "gym", "numpy", "visdom"]
-
-setup(name='apex',
- version='1.0',
- description='Deep reinforcement learning for continuous control',
- author='Pedro Morais, Yesh Godse, Jonah Siekmann',
- author_email='autranemorais@gmail.com, yesh.godse@gmail.com, siekmanj@oregonstate.edu',
- license='MIT',
- packages=find_packages(exclude=("tests")),
- #install_requires=REQUIRED_PACKAGES
-)
diff --git a/deprecated/test_ppo.py b/deprecated/test_ppo.py
deleted file mode 100644
index 95693684..00000000
--- a/deprecated/test_ppo.py
+++ /dev/null
@@ -1,524 +0,0 @@
-# TODO: organize this file
-import argparse
-import pickle
-import torch
-
-import time
-
-import functools
-
-# NOTE: importing cassie for some reason breaks openai gym, BUG ?
-from cassie import CassieEnv, CassieTSEnv, CassieIKEnv
-from cassie.no_delta_env import CassieEnv_nodelta
-from cassie.speed_env import CassieEnv_speed
-from cassie.speed_double_freq_env import CassieEnv_speed_dfreq
-from cassie.speed_no_delta_env import CassieEnv_speed_no_delta
-
-from rl.envs import Normalize, Vectorize
-from rl.policies import GaussianMLP, EnsemblePolicy
-
-import matplotlib
-from matplotlib import pyplot as plt
-plt.style.use('ggplot')
-
-import numpy as np
-np.set_printoptions(precision=2, suppress=True)
-
-from cassie.cassiemujoco import pd_in_t
-
-#TODO: convert all of this into ipynb
-
-
-# TODO: add .dt to all environments. OpenAI should do the same...
-def visualize(env, policy, trj_len, deterministic=True, dt=0.033, speedup=1):
- R = []
- r_ep = 0
- done = False
-
- with torch.no_grad():
- state = torch.Tensor(env.reset())
-
-
- for t in range(trj_len):
-
- #start = time.time()
- _, action = policy.act(state, deterministic)
- #print("policy time: ", time.time() - start)
-
- #start = time.time()
- state, reward, done, _ = env.step(action.data.numpy())
- #print("env time: ", time.time() - start)
-
- r_ep += reward
-
- if done:
- state = env.reset()
- R += [r_ep]
- r_ep = 0
-
- state = torch.Tensor(state)
-
- env.render()
-
- time.sleep(dt / speedup)
-
- if not done:
- R += [r_ep]
-
- print("avg reward:", sum(R)/len(R))
- print("avg timesteps:", trj_len / len(R))
-
-
-
-def cassie_policyplot(env, policy, trj_len, render=False, title=None):
- cassie_action = ["hip roll", "hip yaw", "hip pitch", "knee", "foot"]
-
- obs_dim = env.observation_space.shape[0]
- action_dim = env.action_space.shape[0]
-
- y_delta = np.zeros((trj_len, action_dim))
- y_ref = np.zeros((trj_len, action_dim))
- X = np.zeros((trj_len, action_dim)) # not real X
-
- with torch.no_grad():
- state = torch.Tensor(env.reset())
-
- for t in range(trj_len):
- _, action = policy.act(state, True)
-
- #X[t, :] = state.data.numpy()
- y_delta[t, :] = action.data.numpy() # policy delta
-
- # oooof this is messy/hackish
- ref_pos, _ = env.venv.envs[0].get_ref_state(env.venv.envs[0].phase)
- y_ref[t, :] = ref_pos[env.venv.envs[0].pos_idx] # base PD target
-
- X[t, :] = np.copy(env.venv.envs[0].sim.qpos())[env.venv.envs[0].pos_idx]
-
- state, reward, done, _ = env.step(action.data.numpy())
-
- state = torch.Tensor(state)
-
- if render:
- env.render()
- time.sleep(0.033)
-
- # one row for each leg
- plot_rows = 2
- plot_cols = action_dim // 2
-
- fig, axes = plt.subplots(plot_rows, plot_cols, figsize=(20, 10))
-
- if title is not None:
- fig.suptitle(title, fontsize=16)
-
- for r in range(plot_rows): # 2 legs
- for c in range(plot_cols): # 5 actions
- a = r * plot_cols + c
- axes[r][c].plot(np.arange(trj_len), y_delta[:, a], "C0", label="delta")
- axes[r][c].plot(np.arange(trj_len), y_ref[:, a], "C1", label="reference")
- axes[r][c].plot(np.arange(trj_len), y_delta[:, a] + y_ref[:, a], "C2--", label="summed")
-
- axes[r][c].plot(np.arange(trj_len), X[:, a], "g--", label="result")
-
- axes[0][c].set_xlabel(cassie_action[c])
- axes[0][c].xaxis.set_label_position('top')
- axes[r][0].set_ylabel(["left leg", "right leg"][r])
-
- plt.tight_layout()
-
- if title is not None:
- plt.subplots_adjust(top=0.93)
-
-
- axes[0][0].legend(loc='upper left')
- plt.show()
-
-
-def policyplot(env, policy, trj_len):
- obs_dim = env.observation_space.shape[0]
- action_dim = env.action_space.shape[0]
-
- y = np.zeros((trj_len, action_dim))
- X = np.zeros((trj_len, obs_dim))
-
- state = torch.Tensor(env.reset())
- for t in range(trj_len):
-
- _, action = policy.act(state, True)
-
- X[t, :] = state.data.numpy()
- y[t, :] = action.data.numpy()
-
- state, _, _, _ = env.step(action.data.numpy())
-
- fig, axes = plt.subplots(1, action_dim)
-
-
- for a in range(action_dim):
- axes[a].plot(np.arange(trj_len), y[:, a])
-
- plt.show()
-
-# TODO: add no_grad to all of these
-
-def get_rewards(env, policy, num_trj, deterministic=False):
- r_ep = 0
- R = []
- done = False
-
- state = torch.Tensor(env.reset())
- while len(R) < num_trj:
- _, action = policy.act(state, deterministic)
- state, reward, done, _ = env.step(action.data.numpy())
-
- r_ep += reward
-
- if done:
- state = env.reset()
- R += [r_ep[0]]
- r_ep = 0
-
-
- state = torch.Tensor(state)
-
- if not done:
- R += [r_ep[0]]
-
- return R
-
-# TODO: add n_trials arg, pass in policy function instead
-
-def plot_reward_dist(env, policy, num_trj, deterministic=False):
- r_ep = 0
- R = []
- done = False
-
- state = torch.Tensor(env.reset())
- while len(R) < num_trj:
- _, action = policy.act(state, deterministic)
- state, reward, done, _ = env.step(action.data.numpy())
-
- r_ep += reward
-
- if done:
- state = env.reset()
- R += [r_ep[0]]
- r_ep = 0
-
- state = torch.Tensor(state)
-
- if not done:
- R += [r_ep[0]]
-
- plt.xlabel("Return")
- plt.ylabel("Frequency")
- plt.hist(R, edgecolor='black', label=policy.__class__.__name__) # TODO: make labels less hacky
-
-def plot_action_dist(env, policy, num_trj, deterministic=False):
- n = 0
- A = []
- done = False
-
- state = torch.Tensor(env.reset())
- while n < num_trj:
- _, action = policy.act(state, deterministic)
- state, reward, done, _ = env.step(action.data.numpy())
-
- A += [action.mean().data.numpy()]
-
- if done:
- state = env.reset()
- n += 1
-
-
- state = torch.Tensor(state)
-
- print(np.std(A))
-
- plt.xlabel("Mean Action")
- plt.ylabel("Frequency")
- plt.hist(A, edgecolor='black', label=policy.__class__.__name__) # TODO: make labels less hacky
-
-def print_semantic_state(s):
- # only works for full state for now
- assert s.numel() == 80
-
- x = [None] * 20
-
- # TODO: add mask variable to env to specifiy which states are used
-
- x [ 0] = "Pelvis y"
- x [ 1] = "Pelvis z"
- x [ 2] = "Pelvis qw"
- x [ 3] = "Pelvis qx"
- x [ 4] = "Pelvis qy"
- x [ 5] = "Pelvis qz"
- x [ 6] = "Left hip roll" #(Motor [0])
- x [ 7] = "Left hip yaw" #(Motor [1])
- x [ 8] = "Left hip pitch" #(Motor [2])
- x [ 9] = "Left knee" #(Motor [3])
- x [10] = "Left shin" #(Joint [0])
- x [11] = "Left tarsus" #(Joint [1])
- x [12] = "Left foot" #(Motor [4], Joint [2])
- x [13] = "Right hip roll" #(Motor [5])
- x [14] = "Right hip yaw" #(Motor [6])
- x [15] = "Right hip pitch" #(Motor [7])
- x [16] = "Right knee" #(Motor [8])
- x [17] = "Right shin" #(Joint [3])
- x [18] = "Right tarsus" #(Joint [4])
- x [19] = "Right foot" #(Motor [9], Joint [5])
-
- x_dot = [None] * 20
-
- x_dot[ 0] = "Pelvis x"
- x_dot[ 1] = "Pelvis y"
- x_dot[ 2] = "Pelvis z"
- x_dot[ 3] = "Pelvis wx"
- x_dot[ 4] = "Pelvis wy"
- x_dot[ 5] = "Pelvis wz"
- x_dot[ 6] = "Left hip roll" #(Motor [0])
- x_dot[ 7] = "Left hip yaw" #(Motor [1])
- x_dot[ 8] = "Left hip pitch" #(Motor [2])
- x_dot[ 9] = "Left knee" #(Motor [3])
- x_dot[10] = "Left shin" #(Joint [0])
- x_dot[11] = "Left tarsus" #(Joint [1])
- x_dot[12] = "Left foot" #(Motor [4], Joint [2])
- x_dot[13] = "Right hip roll" #(Motor [5])
- x_dot[14] = "Right hip yaw" #(Motor [6])
- x_dot[15] = "Right hip pitch" #(Motor [7])
- x_dot[16] = "Right knee" #(Motor [8])
- x_dot[17] = "Right shin" #(Joint [3])
- x_dot[18] = "Right tarsus" #(Joint [4])
- x_dot[19] = "Right foot" #(Motor [9], Joint [5])
-
- s_obs = s[:, :40]
- s_ref = s[:, 40:]
-
- print("\nObserved position")
- for i in range(20):
- if s_obs[:, i].item() > 1:
- print("{0}: \r\t\t\t{1:.2f}".format(x[i], s_obs[:, i].item()))
-
- print("\nObserved velocity")
- for i in range(20):
- if s_obs[:, 20 + i].item() > 1:
- print("{0}: \r\t\t\t{1:.2f}".format(x_dot[i], s_obs[:, 20 + i].item()))
-
- print("\nReference position")
- for i in range(20):
- if s_ref[:, i].item() > 1:
- print("{0}: \r\t\t\t{1:.2f}".format(x[i], s_ref[:, i].item()))
-
- print("\nReference velocity")
- for i in range(20):
- if s_ref[:, 20 + i].item() > 1:
- print("{0}: \r\t\t\t{1:.2f}".format(x_dot[i], s_ref[:, 20 + i].item()))
-
-# my idea for extending perturbation saliency to dynamical systems
-def saliency(policy, state, naive=False):
- scores = torch.zeros_like(state)
- for i in range(state.size(1)):
- score = 0
-
- s_prime = state.clone()
- # sample values from (e - 0.25, e + 0.25)
- max_r = 0.25
-
- # change in value/related to number of samples
- dr = 0.05
-
- # current change
- r = -max_r # start at min r
- if not naive:
- while r <= max_r:
- # TODO: visualize s_prime
- # would require setting qpos and visualizing
- # setting qpos would require knowing the mapping of indices
- # from s to qpos
- s_prime[:, i] = state[:, i] + r
-
- v, a = policy.act(state)
- v_prime, a_prime = policy.act(s_prime)
-
- score += 0.5 * torch.norm(v - v_prime, 2)
-
- r += dr
- else:
- s_prime[:, i] = 0
-
- v, a = policy.act(state)
- v_prime, a_prime = policy.act(s_prime)
-
- score += 0.5 * torch.norm(v - v_prime, 2)
-
-
- score /= 2 * max_r
- scores[:, i] = score
- scores = scores / torch.max(scores) * 10
- print_semantic_state(scores)
-
-# more plots:
-# state visitation
-# value function
-# reward distribution
-
-def make_env_fn(state_est=False):
- def _thunk():
- #return CassieEnv("walking", clock_based=True, state_est=state_est)
- return CassieIKEnv("walking", clock_based=True)
- return _thunk
-
-
-parser = argparse.ArgumentParser(description="Run a model, including visualization and plotting.")
-parser.add_argument("-p", "--model_path", type=str, default="./trained_models/demo2.pt",
- help="File path for model to test")
-parser.add_argument("-x", "--no-visualize", dest="visualize", default=True, action='store_false',
- help="Don't render the policy.")
-parser.add_argument("-g", "--graph", dest="plot", default=False, action='store_true',
- help="Graph the output of the policy.")
-
-parser.add_argument("--glen", type=int, default=150,
- help="Length of trajectory to graph.")
-parser.add_argument("--vlen", type=int, default=75,
- help="Length of trajectory to visualize")
-
-parser.add_argument("--noise", default=False, action="store_true",
- help="Visualize policy with exploration.")
-
-parser.add_argument("--new", default=False, action="store_true",
- help="Visualize new (untrained) policy")
-
-args = parser.parse_args()
-
-
-# TODO: add command line arguments for normalization on/off, and for ensemble policy?
-
-if __name__ == "__main__":
- torch.set_num_threads(1) # see: https://github.com/pytorch/pytorch/issues/13757
-
- input()
-
- if args.new:
- env_fn = make_env_fn(state_est=args.state_est)
-
- # env_fn = make_cassie_env("walking", clock_based=True)
- # env_fn = functools.partial(CassieEnv_speed, "walking", clock_based=True, state_est=False)
- # env_fn = functools.partial(CassieEnv_nodelta, "walking", clock_based=True, state_est=False)
- # env_fn = functools.partial(CassieEnv_speed_dfreq, "walking", clock_based = True, state_est=args.state_est)
-
- env = Vectorize([env_fn])
-
- obs_dim = env_fn().observation_space.shape[0]
- action_dim = env_fn().action_space.shape[0]
-
- policy = GaussianMLP(obs_dim, action_dim, nonlinearity="relu", init_std=np.exp(-2), learn_std=False)
-
- # policy2 = ActorCriticNet(obs_dim, action_dim, [256, 256])
-
- # #print(policy, sum(p.numel() for p in policy.parameters()))
- # #print(policy2, sum(p.numel() for p in policy2.parameters()))
-
- # plot_action_dist(env, policy, 100)
- # plot_action_dist(env, policy2, 100)
- # plt.legend()
- # plt.show()
-
- # R1 = []
- # R2 = []
- # R3 = []
- # for _ in range(5):
- # R1 += get_rewards(env, GaussianMLP(obs_dim, action_dim, nonlinearity="relu", init_std=np.exp(-2), learn_std=False), 50)
- # R2 += get_rewards(env, ActorCriticNet(obs_dim, action_dim, [256, 256]), 50)
- # R3 += get_rewards(env, XieNet(obs_dim, action_dim, [256, 256]), 50)
-
-
- # plt.xlabel("Return")
- # plt.ylabel("Frequency")
- # plt.hist(R1, edgecolor='black', label="GaussianMLP")
- # plt.hist(R2, edgecolor='black', label="ActorCriticNet")
- # plt.hist(R3, edgecolor='black', label="XieNet")
- # plt.legend()
- # plt.show()
-
- # plot_reward_dist(env, policy2, 500)
-
- if args.visualize:
- visualize(env, policy, args.vlen, deterministic=not args.noise)
-
- if args.plot:
- cassie_policyplot(env, policy, args.glen)
-
- else:
- policy = torch.load(args.model_path)
- policy.train(0)
-
- env_fn = make_env_fn()
-
- env = env_fn()
-
- # while True:
- # state = torch.Tensor(env.reset())
- # print("phase: {0}".format(env.venv.envs[0].phase))
-
- # saliency(policy, state, naive=False)
-
- # env.venv.envs[0].sim.step_pd(pd_in_t())
-
- # env.render()
- # input()
-
- if args.visualize:
- visualize(env, policy, args.vlen, deterministic=not args.noise)
-
- if args.plot:
- cassie_policyplot(env, policy, args.glen)
-
- exit()
-
-
- # TODO: try averaging obs_norm? to seperate obs normalization for each
- # averaging obs_norm probably wont work as all policies are expecting different normalization parameters
- # ob normalization should therefore should either be part of the policy or somehow syncronized
- # across experiments. The former is easier to implement
- # policy could keep track of its own obs_rms and the env could take it in and update it?
- # ^ might break OpenAI gym compatibility
-
- # other possibility: maybe norm parameters converge on their own over time? Although if they did
- # an ensemble probably wouldn't change behavior
-
- # SOLUTION: give trained policies an ob_rms parameter to normalize their own observations,
- # keep normalization calculation in environment for parallelization
-
- # NOTE: reward normalization affects stuff
-
- #### Ensemble policy
- vpolicy = []
-
- mmean = 0
- mvar = 0
- for i in range(1, 16):
- policy = torch.load("trained_models/modelv{}.pt".format(i))
-
- vpolicy += [policy]
-
- print("visualizing policy {}".format(i))
- # if i == 15:
- visualize(env, policy, 1)
- #cassie_policyplot(env, policy, 100, "policy {}".format(i))
-
- mmean /= 10
- mvar /= 10
-
- env.ob_rms.mean = mmean
- env.ob_rms.var = mvar
-
- #for i in range(10):
-
- # NOTE: reference trajectory for ensemble policy looks weird?
- # Because it's the NORMALIZED reference trajectory^^
- # but still, weird
- policy = EnsemblePolicy(vpolicy)
-
- #visualize(env, policy, 100)
- cassie_policyplot(env, policy, 75, render=True)
diff --git a/deprecated/test_ray.py b/deprecated/test_ray.py
deleted file mode 100644
index be2dc3bb..00000000
--- a/deprecated/test_ray.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import ray
-import time
-
-@ray.remote
-class Actor():
- def __init__(self, memory_id, learner_id):
- self.counter = 0
- self.memory_id = memory_id
- self.learner_id = learner_id
-
- def collect_experience(self):
- bar = False
- while True:
- #print("collecting")
- # simulate load of stepping through environment
- time.sleep(0.01)
- # increment counter
- self.counter += 1
- # send new data to memory server
- self.memory_id.receive_experience.remote(self.counter)
-
- # periodically query learner
- if self.counter % 5 == 0:
- foo, bar = ray.get(self.learner_id.send_step_count.remote())
- print("got stuff from learner")
-
- if bar:
- print("quitting collection loop")
- return
-
-@ray.remote
-class MemoryServer():
- def __init__(self):
- self.counter = 0
-
- def receive_experience(self, data):
- self.counter += data
-
- def send_experience(self):
- return self.counter
-
-@ray.remote
-class Evaluator():
- def __init__(self):
- self.counter = 0
-
- def evaluate_loop(self, learner_id):
- bar = False
- while True:
- time.sleep(0.5)
- foo, bar = ray.get(learner_id.send_step_count.remote())
- print("Evaluating: {}".format(foo))
- if bar:
- return
-
-@ray.remote
-class Learner():
- def __init__(self, memory_id, eval_id):
- self.steps = 0
- self.done = False
- self.num_train_steps = 100
- self.memory_id = memory_id
- self.eval_id = eval_id
-
- def train(self):
- while self.steps < self.num_train_steps:
- # collect experience from memory
- memory_data = ray.get(self.memory_id.send_experience.remote())
- # simulate optimization
- time.sleep(0.25)
- # increment number of steps
- self.steps += 1
-
- print("exiting learner")
- self.done = True
-
- def send_step_count(self):
- return self.steps, self.done
-
-
-if __name__ == "__main__":
- ray.init()
-
- # create many Actors, one Memory Server, one Evaluator, one Learner
- memory_id = MemoryServer.remote()
- eval_id = Evaluator.remote()
- learner_id = Learner.remote(memory_id, eval_id)
- actor_ids = [Actor.remote(memory_id, learner_id) for _ in range(10)]
-
- # Start actor collection loop and learner train loop and evaluator loop
- futures = [actor_id.collect_experience.remote() for actor_id in actor_ids]
- futures.append(learner_id.train.remote())
- futures.append(eval_id.evaluate_loop.remote(learner_id))
-
- ray.wait(futures, num_returns=len(futures))
-
- print("All done.")
-
diff --git a/deprecated/test_recurrence.py b/deprecated/test_recurrence.py
deleted file mode 100644
index f8d1c484..00000000
--- a/deprecated/test_recurrence.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import torch
-import torch.nn as nn
-import numpy as np
-from rl.policies.recurrent import RecurrentNet
-
-x = torch.ones(1, 4)
-actor = RecurrentNet(4, 3)
-
-actions = []
-loss = nn.MSELoss()
-
-for i in range(5):
- actions.append(actor.forward(x))
- print(actions)
-
-actions = torch.cat([action for action in actions])
-print(actions)
-target = torch.randn(5, 3)
-
-grad = loss(actions, target)
-grad.backward()
-print("success!")
diff --git a/deprecated/test_reference_traj.py b/deprecated/test_reference_traj.py
deleted file mode 100644
index 0400c4e8..00000000
--- a/deprecated/test_reference_traj.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import matplotlib.pyplot as plt
-
-import numpy as np
-
-from cassie import CassieEnv, CassieTSEnv, UnifiedCassieIKEnv, CassieEnv_nodelta, CassieEnv_rand_dyn, CassieEnv_speed_dfreq
-
-# env_fn = partial(CassieIKEnv, "walking", clock_based=True, state_est=state_est, speed=speed)
-env = UnifiedCassieIKEnv("walking", clock_based=True, state_est=True)
-
-
-env.reset()
-# env.render()
-
-qposes = []
-qvels = []
-
-# manually set the environment's speed
-
-env.speed = 0.0
-
-print("\nTesting Stuff\n")
-
-print(env.phaselen)
-
-for i in range(env.phaselen):
-
- """
- output of get_ref_state for clock and state estimator
- np.concatenate([qpos[pos_index], qvel[vel_index], np.concatenate((clock, [self.speed]))]])
- """
-
- # get reference state
- pos, vel = env.get_ref_state(i)
-
- # extract qpos, qvel
- qposes.append(pos)
- qvels.append(vel)
-
-# plot the qpos
-fig = plt.figure(figsize=(10,10))
-ax = fig.add_subplot(111)
-ax.plot(qposes)
-plt.savefig('ref_qposes.png')
\ No newline at end of file
diff --git a/deprecated/tests/test_ppo.py b/deprecated/tests/test_ppo.py
deleted file mode 100644
index 8216cf32..00000000
--- a/deprecated/tests/test_ppo.py
+++ /dev/null
@@ -1,187 +0,0 @@
-from collections import defaultdict
-
-import pytest
-import numpy as np
-
-import torch
-
-from rl.policies import GaussianMLP
-from rl.algos import PPO
-
-class SampleTesterEnv:
- def __init__(self, obs_dim, action_dim, done_state=10, gamma=0.99):
- """
- A simple environment that unit tests whether or the
- experience buffer and trajectory sampling code are
- producing the correct output. This is to test for things
- like off-by-one errors in the experience buffers or
- reward-to-go calculations.
-
- In other words:
-
- Makes sure the "experience table" is of the form:
-
- --------------------
- s_0 | a_0 | r_0
- --------------------
- . . .
- . . .
- . . .
- --------------------
- s_T | a_T | r_T
- --------------------
- s_T+1 | |
- --------------------
-
- where entries are defined by the MDP transitions:
-
- s_0 -> (s_1, a_0, r_0) -> ... -> (s_T+1, a_T, r_T)
-
- """
-
- self.observation_space = np.zeros(obs_dim)
- self.action_space = np.zeros(action_dim)
-
- self.state = 0
- self.obs_dim = obs_dim
-
- # TODO: add GAE support?
- self.gamma = gamma
-
- self.done = done_state
-
- self.actions = []
-
- def step(self, action):
- self.state += 1
-
- output = np.ones(shape=(1, self.obs_dim)) * self.state
-
- done = (self.state % self.done) == 0
-
- #print("real: ", done)
-
- # the first reward corresponds to the second state
- reward = np.ones(shape=(1, 1)) * (self.state - 1)
-
- self.actions.append(action.squeeze(0)) # TODO
-
- return output, reward, done, None
-
- def reset(self):
- self.state = 0
-
- output = np.ones(shape=(1, self.obs_dim)) * self.state
- return output
-
-
-@pytest.mark.parametrize("num_steps, obs_dim, action_dim", [
- (5, 1, 1),
- (10, 1, 1),
- (25, 1, 1),
- (10, 80, 10),
- (30, 80, 10),
- (35, 80, 10)
-])
-def test_ppo_sample(num_steps, obs_dim, action_dim):
- # useful for debugging
- np.set_printoptions(threshold=10000)
- torch.set_printoptions(threshold=10000)
-
- # TODO: test value est bootstrap for truncated trajectories
-
- gamma = 0.99
-
- env = SampleTesterEnv(obs_dim=obs_dim, action_dim=action_dim, gamma=gamma)
- policy = GaussianMLP(obs_dim, action_dim)
-
- # don't need to specify args that don't affect ppo.sample()
- args = defaultdict(lambda: None, {'gamma': gamma})
-
- algo = PPO(args)
-
- memory = algo.sample(env, policy, num_steps, 100)
-
- states, actions, rewards, returns = map(torch.Tensor,
- (memory.states, memory.actions, memory.rewards, memory.returns)
- )
-
- num_steps = states.shape[0]
-
- assert states.shape == (num_steps, obs_dim)
- assert actions.shape == (num_steps, action_dim)
- assert rewards.shape == (num_steps, 1)
- assert rewards.shape == (num_steps, 1)
-
- expected_states = np.array([(np.ones(shape=(obs_dim,)) * (s % env.done)) for s in range(num_steps)])
- assert np.allclose(states, expected_states)
-
- expected_rewards = np.array([(np.ones(shape=(1)) * (s % env.done)) for s in range(num_steps)])
- assert np.allclose(rewards, expected_rewards)
-
- expected_actions = np.array(env.actions)
- assert np.allclose(actions, expected_actions)
-
- expected_returns, R = [], 0
- for r in reversed(expected_rewards):
- R = R * gamma + r
-
- expected_returns.insert(0, R.copy())
-
- if r == 0: # this only happens on initial state, so restart the return
- R = 0
-
- expected_returns = np.array(expected_returns)
- assert np.allclose(returns, expected_returns)
-
-
-@pytest.mark.parametrize("num_steps, obs_dim, action_dim", [
- (5, 1, 1),
- (10, 1, 1),
- (25, 1, 1),
- (10, 80, 10),
- (30, 80, 10),
- (35, 80, 10)
-])
-def test_ppo_sample_parallel(num_steps, obs_dim, action_dim):
- # useful for debugging
- np.set_printoptions(threshold=10000)
- torch.set_printoptions(threshold=10000)
-
- # TODO: test value est bootstrap for truncated trajectories
-
- gamma = 0.99
-
- from functools import partial
-
- env = SampleTesterEnv(obs_dim=obs_dim, action_dim=action_dim, gamma=gamma)
- env_fn = partial(
- SampleTesterEnv,
- obs_dim=obs_dim,
- action_dim=action_dim,
- gamma=gamma
- )
-
- policy = GaussianMLP(obs_dim, action_dim)
-
- # don't need to specify args that don't affect ppo.sample()
- args = defaultdict(lambda: None, {'gamma': gamma, 'num_procs': 4})
-
- algo = PPO(args)
-
- memory = algo.sample_parallel(env_fn, policy, num_steps, 100)
-
- expected_memory = algo.sample(env, policy, 40, 100)
-
- #breakpoint()
-
- assert np.allclose(memory.states, expected_memory.states)
- #assert np.allclose(memory.actions, expected_memory.actions)
- assert np.allclose(memory.rewards, expected_memory.rewards)
- #assert np.allclose(memory.returns, expected_memory.returns)
- assert np.allclose(memory.returns, expected_memory.returns)
-
- assert np.allclose(memory.ep_returns, expected_memory.ep_returns)
- assert np.allclose(memory.ep_lens, expected_memory.ep_lens)
-
-test_ppo_sample_parallel(5, 1, 1)
\ No newline at end of file
diff --git a/deprecated/validate.py b/deprecated/validate.py
deleted file mode 100644
index ea0670e1..00000000
--- a/deprecated/validate.py
+++ /dev/null
@@ -1,212 +0,0 @@
-#validate the sensitivity of policy to environment parameters
-
-##########To do#########
-#Formalize desired output
-#Get to run with user input of env and policy without iteration
-#Iterate through each parameter
-
-def iterativeValidation(policy, env, max_traj_len=1000, visualize=True, env_name=None, speed=0.0, state_est=True, clock_based=False):
- # Follow apex.py and use an env_factory, or expect an env on input?
-
- #if env_name is None:
- # env = env_factory(policy.env_name, speed=speed, state_est=state_est, clock_based=clock_based)()
- #else:
- # env = env_factory(env_name, speed=speed, state_est=state_est, clock_based=clock_based)()
-
- # This stuff was ripped out from Jonah's dynamics randomization.
- # We are leaving it as is while building the infrastructure for testing.
- # Future plan: Put all the damp/mass ranges into a tuple and iterate through,
- # at each iteration running a test with the current parameter randomized, and all
- # others default. Then log results and output in a sane and useful way.
-
- damp = env.default_damping
- weak_factor = 0.5
- strong_factor = 1.5
- pelvis_damp_range = [[damp[0], damp[0]],
- [damp[1], damp[1]],
- [damp[2], damp[2]],
- [damp[3], damp[3]],
- [damp[4], damp[4]],
- [damp[5], damp[5]]] # 0->5
-
- hip_damp_range = [[damp[6]*weak_factor, damp[6]*strong_factor],
- [damp[7]*weak_factor, damp[7]*strong_factor],
- [damp[8]*weak_factor, damp[8]*strong_factor]] # 6->8 and 19->21
-
- achilles_damp_range = [[damp[9]*weak_factor, damp[9]*strong_factor],
- [damp[10]*weak_factor, damp[10]*strong_factor],
- [damp[11]*weak_factor, damp[11]*strong_factor]] # 9->11 and 22->24
-
- knee_damp_range = [[damp[12]*weak_factor, damp[12]*strong_factor]] # 12 and 25
- shin_damp_range = [[damp[13]*weak_factor, damp[13]*strong_factor]] # 13 and 26
- tarsus_damp_range = [[damp[14], damp[14]]] # 14 and 27
- heel_damp_range = [[damp[15], damp[15]]] # 15 and 28
- fcrank_damp_range = [[damp[16]*weak_factor, damp[16]*strong_factor]] # 16 and 29
- prod_damp_range = [[damp[17], damp[17]]] # 17 and 30
- foot_damp_range = [[damp[18]*weak_factor, damp[18]*strong_factor]] # 18 and 31
-
- side_damp = hip_damp_range + achilles_damp_range + knee_damp_range + shin_damp_range + tarsus_damp_range + heel_damp_range + fcrank_damp_range + prod_damp_range + foot_damp_range
- damp_range = pelvis_damp_range + side_damp + side_damp
- damp_noise = [np.random.uniform(a, b) for a, b in damp_range]
-
- hi = 1.3
- lo = 0.7
- m = env.default_mass
- pelvis_mass_range = [[lo*m[1], hi*m[1]]] # 1
- hip_mass_range = [[lo*m[2], hi*m[2]], # 2->4 and 14->16
- [lo*m[3], hi*m[3]],
- [lo*m[4], hi*m[4]]]
-
- achilles_mass_range = [[lo*m[5], hi*m[5]]] # 5 and 17
- knee_mass_range = [[lo*m[6], hi*m[6]]] # 6 and 18
- knee_spring_mass_range = [[lo*m[7], hi*m[7]]] # 7 and 19
- shin_mass_range = [[lo*m[8], hi*m[8]]] # 8 and 20
- tarsus_mass_range = [[lo*m[9], hi*m[9]]] # 9 and 21
- heel_spring_mass_range = [[lo*m[10], hi*m[10]]] # 10 and 22
- fcrank_mass_range = [[lo*m[11], hi*m[11]]] # 11 and 23
- prod_mass_range = [[lo*m[12], hi*m[12]]] # 12 and 24
- foot_mass_range = [[lo*m[13], hi*m[13]]] # 13 and 25
-
- side_mass = hip_mass_range + achilles_mass_range \
- + knee_mass_range + knee_spring_mass_range \
- + shin_mass_range + tarsus_mass_range \
- + heel_spring_mass_range + fcrank_mass_range \
- + prod_mass_range + foot_mass_range
-
- mass_range = [[0, 0]] + pelvis_mass_range + side_mass + side_mass
- mass_noise = [np.random.uniform(a, b) for a, b in mass_range]
-
- delta_y_min, delta_y_max = env.default_ipos[4] - 0.07, env.default_ipos[4] + 0.07
- delta_z_min, delta_z_max = env.default_ipos[5] - 0.04, env.default_ipos[5] + 0.04
- com_noise = [0, 0, 0] + [np.random.uniform(-0.25, 0.06)] + [np.random.uniform(delta_y_min, delta_y_max)] + [np.random.uniform(delta_z_min, delta_z_max)] + list(env.default_ipos[6:])
-
- fric_noise = [np.random.uniform(0.4, 1.4)] + [np.random.uniform(3e-3, 8e-3)] + list(env.default_fric[2:])
-
- env.sim.set_dof_damping(np.clip(damp_noise, 0, None))
- env.sim.set_body_mass(np.clip(mass_noise, 0, None))
- env.sim.set_body_ipos(com_noise)
- env.sim.set_ground_friction(np.clip(fric_noise, 0, None))
-
- # From policy_eval
- while True:
- state = env.reset()
- done = False
- timesteps = 0
- eval_reward = 0
- while not done and timesteps < max_traj_len:
- if hasattr(env, 'simrate'):
- start = time.time()
-
- action = policy.forward(torch.Tensor(state)).detach().numpy()
- state, reward, done, _ = env.step(action)
- if visualize:
- env.render()
- eval_reward += reward
- timesteps += 1
-
- if hasattr(env, 'simrate'):
- # assume 30hz (hack)
- end = time.time()
- delaytime = max(0, 1000 / 30000 - (end-start))
- time.sleep(delaytime)
-
- print("Eval reward: ", eval_reward)
-
-# Testing to see if the above is even working
-
-from apex import env_factory
-import torch
-env = env_factory("CassieIK-v0")
-#policy = torch.load("./trained_models/ddpg/ddpg_actor.pt")
-policy = torch.load("./logs/ddpg/Hopper-v3/538e63-seed0/actor.pt")
-iterativeValidation(policy, env)
-
-#nbody layout:
-# 0: worldbody (zero)
-# 1: pelvis
-
-# 2: left hip roll
-# 3: left hip yaw
-# 4: left hip pitch
-# 5: left achilles rod
-# 6: left knee
-# 7: left knee spring
-# 8: left shin
-# 9: left tarsus
-# 10: left heel spring
-# 12: left foot crank
-# 12: left plantar rod
-# 13: left foot
-
-# 14: right hip roll
-# 15: right hip yaw
-# 16: right hip pitch
-# 17: right achilles rod
-# 18: right knee
-# 19: right knee spring
-# 20: right shin
-# 21: right tarsus
-# 22: right heel spring
-# 23: right foot crank
-# 24: right plantar rod
-# 25: right foot
-
-
-# qpos layout
-# [ 0] Pelvis x
-# [ 1] Pelvis y
-# [ 2] Pelvis z
-# [ 3] Pelvis orientation qw
-# [ 4] Pelvis orientation qx
-# [ 5] Pelvis orientation qy
-# [ 6] Pelvis orientation qz
-# [ 7] Left hip roll (Motor [0])
-# [ 8] Left hip yaw (Motor [1])
-# [ 9] Left hip pitch (Motor [2])
-# [10] Left achilles rod qw
-# [11] Left achilles rod qx
-# [12] Left achilles rod qy
-# [13] Left achilles rod qz
-# [14] Left knee (Motor [3])
-# [15] Left shin (Joint [0])
-# [16] Left tarsus (Joint [1])
-# [17] Left heel spring
-# [18] Left foot crank
-# [19] Left plantar rod
-# [20] Left foot (Motor [4], Joint [2])
-# [21] Right hip roll (Motor [5])
-# [22] Right hip yaw (Motor [6])
-# [23] Right hip pitch (Motor [7])
-# [24] Right achilles rod qw
-# [25] Right achilles rod qx
-# [26] Right achilles rod qy
-# [27] Right achilles rod qz
-# [28] Right knee (Motor [8])
-# [29] Right shin (Joint [3])
-# [30] Right tarsus (Joint [4])
-# [31] Right heel spring
-# [32] Right foot crank
-# [33] Right plantar rod
-# [34] Right foot (Motor [9], Joint [5])
-
-# qvel layout
-# [ 0] Pelvis x
-# [ 1] Pelvis y
-# [ 2] Pelvis z
-# [ 3] Pelvis orientation wx
-# [ 4] Pelvis orientation wy
-# [ 5] Pelvis orientation wz
-# [ 6] Left hip roll (Motor [0])
-# [ 7] Left hip yaw (Motor [1])
-# [ 8] Left hip pitch (Motor [2])
-# [ 9] Left knee (Motor [3])
-# [10] Left shin (Joint [0])
-# [11] Left tarsus (Joint [1])
-# [12] Left foot (Motor [4], Joint [2])
-# [13] Right hip roll (Motor [5])
-# [14] Right hip yaw (Motor [6])
-# [15] Right hip pitch (Motor [7])
-# [16] Right knee (Motor [8])
-# [17] Right shin (Joint [3])
-# [18] Right tarsus (Joint [4])
-# [19] Right foot (Motor [9], Joint [5])
diff --git a/img/output.gif b/img/output.gif
new file mode 100644
index 00000000..ba560fc4
Binary files /dev/null and b/img/output.gif differ
diff --git a/img/output2.gif b/img/output2.gif
new file mode 100644
index 00000000..c8d09b97
Binary files /dev/null and b/img/output2.gif differ
diff --git a/plot_policy.py b/plot_policy.py
index 3d127666..ef82776b 100644
--- a/plot_policy.py
+++ b/plot_policy.py
@@ -4,11 +4,7 @@
from torch.autograd import Variable
import time, os, sys
-from util import env_factory
-<<<<<<< HEAD
-from rl.policies.actor import Gaussian_FF_Actor
-=======
->>>>>>> ed7815cacac97f21f6b66cbbf8f22c610e5f1d19
+from util.env import env_factory
import argparse
import pickle
@@ -28,30 +24,40 @@
print("pre steps: {}\tnum_steps: {}".format(args.pre_steps, args.num_steps))
print("pre speed: {}\tplot_speed: {}".format(args.pre_speed, args.plot_speed))
-# RUN_NAME = run_args.run_name if run_args.run_name != None else "plot"
-
-# RUN_NAME = "7b7e24-seed0"
-# POLICY_PATH = "../trained_models/ppo/Cassie-v0/" + RUN_NAME + "/actor.pt"
-
-# Load environment and policy
-if (not hasattr('run_args', 'simrate')):
- run_args.simrate = 50
-env_fn = env_factory(run_args.env_name, traj=run_args.traj, simrate=run_args.simrate, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random,
- mirror=False, clock_based=run_args.clock_based, reward="iros_paper", history=run_args.history)
+# Load environment
+env_fn = env = env_factory(
+ run_args.env_name,
+ command_profile=run_args.command_profile,
+ input_profile=run_args.input_profile,
+ simrate=run_args.simrate,
+ dynamics_randomization=run_args.dyn_random,
+ mirror=run_args.mirror,
+ learn_gains=run_args.learn_gains,
+ reward=run_args.reward,
+ history=run_args.history,
+ no_delta=run_args.no_delta,
+ traj=run_args.traj,
+ ik_baseline=run_args.ik_baseline
+)
cassie_env = env_fn()
+
+# Load policy
policy = torch.load(os.path.join(args.path, "actor.pt"))
policy.eval()
-
if hasattr(policy, 'init_hidden_state'):
policy.init_hidden_state()
-obs_dim = cassie_env.observation_space.shape[0] # TODO: could make obs and ac space static properties
+obs_dim = cassie_env.observation_space.shape[0] # TODO: could make obs and ac space static properties
action_dim = cassie_env.action_space.shape[0]
-no_delta = cassie_env.no_delta
+if hasattr(cassie_env, "no_delta"):
+ no_delta = cassie_env.no_delta
+else:
+ no_delta = True
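+    # (assumption: envs without a no_delta attribute fall back to env_factory's default of True)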
limittargs = False
lininterp = False
offset = cassie_env.offset
+learn_gains = cassie_env.learn_gains
num_steps = args.num_steps
pre_steps = args.pre_steps
@@ -89,6 +95,8 @@
cassie_env.update_speed(args.plot_speed)
for i in range(num_steps):
action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()
+ if learn_gains:
+ action, gain_deltas = action[:10], action[10:]
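+            # assumption: the first 10 outputs are motor position targets and the remainder
+            # are PD gain deltas, forwarded to step_simulation(..., learned_gains=gain_deltas) below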
lin_steps = int(60 * 3/4) # Number of steps to interpolate over. Should be between 0 and self.simrate
alpha = 1 / lin_steps
for j in range(simrate):
@@ -122,7 +130,10 @@
zero2zero_clock = 0.5*(np.cos(2*np.pi/(cassie_env.phaselen+1)*(cassie_env.phase-(cassie_env.phaselen+1)/2)) + 1)
one2one_clock = 0.5*(np.cos(2*np.pi/(cassie_env.phaselen+1)*cassie_env.phase) + 1)
- cassie_env.step_simulation(real_action)
+ if learn_gains:
+ cassie_env.step_simulation(real_action, learned_gains=gain_deltas)
+ else:
+ cassie_env.step_simulation(real_action)
curr_qpos = cassie_env.sim.qpos()
curr_qvel = cassie_env.sim.qvel()
motor_pos[i*simrate+j, :] = np.array(curr_qpos)[cassie_env.pos_idx]
diff --git a/rl/algos/__init__.py b/rl/algos/__init__.py
index 9afe683a..8b137891 100644
--- a/rl/algos/__init__.py
+++ b/rl/algos/__init__.py
@@ -1,3 +1 @@
-from .ars import ARS
-from .ppo import PPO
-from .dpg import DPG
+
diff --git a/rl/algos/ars.py b/rl/algos/ars.py
index 33485793..c371ffc0 100644
--- a/rl/algos/ars.py
+++ b/rl/algos/ars.py
@@ -3,7 +3,8 @@
import ray
import time
-from apex import env_factory, create_logger
+from util.env import env_factory
+from util.log import create_logger
# This function adapted from https://github.com/modestyachts/ARS/blob/master/code/shared_noise.py
# (Thanks to Horia Mania)
diff --git a/rl/algos/async_td3.py b/rl/algos/async_td3.py
index 7ae9956e..f59917f6 100644
--- a/rl/algos/async_td3.py
+++ b/rl/algos/async_td3.py
@@ -4,7 +4,7 @@
from rl.policies.critic import Dual_Q_Critic as Critic
# Plot results
-from apex import create_logger
+from util.log import create_logger
import time
import os
@@ -26,7 +26,8 @@
# TODO: create way to resume experiment by loading actor and critic pt files
def run_experiment(args):
torch.set_num_threads(1)
- from apex import env_factory, create_logger
+ from util.env import env_factory
+ from util.log import create_logger
# Start ray
ray.init(num_gpus=0, include_webui=True, redis_address=args.redis_address)
diff --git a/rl/algos/ppo.py b/rl/algos/ppo.py
index 6b8b4f67..2b7e55ff 100644
--- a/rl/algos/ppo.py
+++ b/rl/algos/ppo.py
@@ -505,19 +505,11 @@ def train(self,
self.save(policy, critic)
def run_experiment(args):
- from util import env_factory, create_logger
-
- # torch.set_num_threads(1)
-
- if args.ik_baseline and args.no_delta:
- args.ik_baseline = False
-
- # TODO: remove this at some point once phase_based is stable
- if args.phase_based:
- args.clock_based = False
+ from util.env import env_factory
+ from util.log import create_logger
# wrapper function for creating parallelized envs
- env_fn = env_factory(args.env_name, traj=args.traj, simrate=args.simrate, phase_based=args.phase_based, clock_based=args.clock_based, state_est=args.state_est, no_delta=args.no_delta, learn_gains=args.learn_gains, ik_baseline=args.ik_baseline, dynamics_randomization=args.dyn_random, mirror=args.mirror, reward=args.reward, history=args.history)
+    env_fn = env_factory(
+        args.env_name,
+        simrate=args.simrate,
+        command_profile=args.command_profile,
+        input_profile=args.input_profile,
+        learn_gains=args.learn_gains,
+        dynamics_randomization=args.dyn_random,
+        reward=args.reward,
+        history=args.history,
+        mirror=args.mirror,
+        ik_baseline=args.ik_baseline,
+        no_delta=args.no_delta,
+        traj=args.traj
+    )
obs_dim = env_fn().observation_space.shape[0]
action_dim = env_fn().action_space.shape[0]
@@ -566,20 +558,6 @@ def run_experiment(args):
algo = PPO(args=vars(args), save_path=logger.dir)
- print()
- print("Environment: {}".format(args.env_name))
- print(" ├ traj: {}".format(args.traj))
- print(" ├ phase_based: {}".format(args.phase_based))
- print(" ├ clock_based: {}".format(args.clock_based))
- print(" ├ state_est: {}".format(args.state_est))
- print(" ├ dyn_random: {}".format(args.dyn_random))
- print(" ├ no_delta: {}".format(args.no_delta))
- print(" ├ mirror: {}".format(args.mirror))
- print(" ├ ik baseline: {}".format(args.ik_baseline))
- print(" ├ learn gains: {}".format(args.learn_gains))
- print(" ├ reward: {}".format(env_fn().reward_func))
- print(" └ obs_dim: {}".format(obs_dim))
-
print()
print("Synchronous Distributed Proximal Policy Optimization:")
print(" ├ recurrent: {}".format(args.recurrent))
diff --git a/test_policy.py b/test_policy.py
index e2750a30..dbcb11ae 100644
--- a/test_policy.py
+++ b/test_policy.py
@@ -1,12 +1,11 @@
-from cassie import CassieEnv, CassiePlayground
-from rl.policies.actor import GaussianMLP_Actor
+from cassie import CassiePlayground
from tools.test_commands import *
from tools.eval_perturb import *
from tools.eval_mission import *
from tools.compare_pols import *
from tools.eval_sensitivity import *
from collections import OrderedDict
-from util import env_factory
+from util.env import env_factory
import torch
import pickle
@@ -47,8 +46,20 @@
# env_fn = partial(CassieEnv, traj=run_args.traj, clock_based=run_args.clock_based, state_est=run_args.state_est, dynamics_randomization=run_args.dyn_random)
# Make mirror False so that env_factory returns a regular wrap env function and not a symmetric env function that can be called to return
# a cassie environment (symmetric env cannot be called to make another env)
-env_fn = env_factory(run_args.env_name, traj=run_args.traj, simrate=run_args.simrate, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random,
- mirror=False, clock_based=run_args.clock_based, reward=run_args.reward, history=run_args.history)
+env_fn = env = env_factory(
+ run_args.env_name,
+ command_profile=run_args.command_profile,
+ input_profile=run_args.input_profile,
+ simrate=run_args.simrate,
+ dynamics_randomization=run_args.dyn_random,
+ mirror=run_args.mirror,
+ learn_gains=run_args.learn_gains,
+ reward=run_args.reward,
+ history=run_args.history,
+ no_delta=run_args.no_delta,
+ traj=run_args.traj,
+ ik_baseline=run_args.ik_baseline
+)
cassie_env = env_fn()
policy = torch.load(os.path.join(args.path, "actor.pt"))
if args.eval:
@@ -56,25 +67,24 @@
if hasattr(policy, 'init_hidden_state'):
policy.init_hidden_state()
-
-#TODO: make returning/save data in file inside function consist for all testing functions
+# TODO: make returning/saving data to file consistent across all testing functions
def test_commands(cassie_env, policy, args):
print("Testing speed and orient commands")
if args.n_procs == 1:
- save_data = eval_commands(cassie_env, policy, num_steps=args.n_steps, num_commands=args.n_commands,
+ save_data = eval_commands(cassie_env, policy, num_steps=args.n_steps, num_commands=args.n_commands,
max_speed=args.max_speed, min_speed=args.min_speed, num_iters=args.n_iter)
np.save(os.path.join(args.path, "eval_commands.npy"), save_data)
else:
- eval_commands_multi(env_fn, policy, num_steps=args.n_steps, num_commands=args.n_commands, max_speed=args.max_speed,
+ eval_commands_multi(env_fn, policy, num_steps=args.n_steps, num_commands=args.n_commands, max_speed=args.max_speed,
min_speed=args.min_speed, num_iters=args.n_iter, num_procs=args.n_procs, filename=os.path.join(args.path, "eval_commands.npy"))
def test_perturbs(cassie_env, policy, args):
print("Testing perturbations")
if args.n_procs == 1:
- save_data = compute_perturbs(cassie_env, policy, wait_time=args.wait_time, perturb_duration=args.pert_dur, perturb_size=args.pert_size,
+ save_data = compute_perturbs(cassie_env, policy, wait_time=args.wait_time, perturb_duration=args.pert_dur, perturb_size=args.pert_size,
perturb_incr=args.pert_incr, perturb_body=args.pert_body, num_angles=args.num_angles)
else:
- save_data = compute_perturbs_multi(env_fn, policy, wait_time=args.wait_time, perturb_duration=args.pert_dur, perturb_size=args.pert_size,
+ save_data = compute_perturbs_multi(env_fn, policy, wait_time=args.wait_time, perturb_duration=args.pert_dur, perturb_size=args.pert_size,
perturb_incr=args.pert_incr, perturb_body=args.pert_body, num_angles=args.num_angles, num_procs=args.n_procs)
np.save(os.path.join(args.path, "eval_perturbs.npy"), save_data)
@@ -92,7 +102,7 @@ def test_perturbs(cassie_env, policy, args):
if not args.viz:
print("Testing missions")
save_data = []
-
+
for mission in missions:
print(mission + " mission:")
cassie_env = CassiePlayground(traj=run_args.traj, clock_based=run_args.clock_based, state_est=run_args.state_est, dynamics_randomization=run_args.dyn_random, mission=mission)
diff --git a/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.info b/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.info
index 5fa94265..5d3940b6 100644
--- a/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.info
+++ b/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.info
@@ -1,31 +1,13 @@
-algo_name: ppo
-clip: 0.2
-clock_based: True
+command_profile: clock
dyn_random: False
-entropy_coeff: 0.0
env_name: Cassie-v0
-epochs: 5
-eps: 1e-05
-gamma: 0.99
history: 0
-input_norm_steps: 100
-lam: 0.95
-lr: 0.0001
-max_grad_norm: 0.05
-max_traj_len: 300
-minibatch_size: 2048
+ik_baseline: None
+input_profile: full
+learn_gains: False
mirror: True
-n_itr: 20000
-name: model
no_delta: True
-num_procs: 64
-num_steps: 187
-previous: None
recurrent: False
-redis_address: None
reward: 5k_speed_reward
simrate: 60
-state_est: True
-traj: walking
-use_gae: False
-viz_port: 8097
+traj: None
diff --git a/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.pkl b/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.pkl
index 74d2df1d..2fb18138 100644
Binary files a/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.pkl and b/trained_models/nodelta_neutral_StateEst_symmetry_speed0-3_freq1-2/experiment.pkl differ
diff --git a/util.py b/util.py
deleted file mode 100644
index ccc8c817..00000000
--- a/util.py
+++ /dev/null
@@ -1,752 +0,0 @@
-import torch
-import hashlib, os, pickle
-from collections import OrderedDict
-import sys, time
-from cassie.quaternion_function import *
-
-from tkinter import *
-import multiprocessing as mp
-from cassie.phase_function import LivePlot
-import matplotlib.pyplot as plt
-
-import tty
-import termios
-import select
-import numpy as np
-from cassie import CassieEnv, CassieMinEnv, CassiePlayground, CassieStandingEnv, CassieEnv_noaccel_footdist_omniscient, CassieEnv_footdist, CassieEnv_noaccel_footdist, CassieEnv_novel_footdist, CassieEnv_mininput
-from cassie.cassiemujoco import CassieSim
-
-class color:
- BOLD = '\033[1m\033[48m'
- END = '\033[0m'
- ORANGE = '\033[38;5;202m'
- BLACK = '\033[38;5;240m'
-
-
-def print_logo(subtitle="", option=2):
- print()
- print(color.BOLD + color.ORANGE + " .8. " + color.BLACK + " 8 888888888o " + color.ORANGE + "8 8888888888 `8.`8888. ,8' ")
- print(color.BOLD + color.ORANGE + " .888. " + color.BLACK + " 8 8888 `88. " + color.ORANGE + "8 8888 `8.`8888. ,8' ")
- print(color.BOLD + color.ORANGE + " :88888. " + color.BLACK + " 8 8888 `88 " + color.ORANGE + "8 8888 `8.`8888. ,8' ")
- print(color.BOLD + color.ORANGE + " . `88888. " + color.BLACK + " 8 8888 ,88 " + color.ORANGE + "8 8888 `8.`8888.,8' ")
- print(color.BOLD + color.ORANGE + " .8. `88888. " + color.BLACK + " 8 8888. ,88' " + color.ORANGE + "8 888888888888 `8.`88888' ")
- print(color.BOLD + color.ORANGE + " .8`8. `88888. " + color.BLACK + " 8 888888888P' " + color.ORANGE + "8 8888 .88.`8888. ")
- print(color.BOLD + color.ORANGE + " .8' `8. `88888. " + color.BLACK + " 8 8888 " + color.ORANGE + "8 8888 .8'`8.`8888. ")
- print(color.BOLD + color.ORANGE + " .8' `8. `88888. " + color.BLACK + " 8 8888 " + color.ORANGE + "8 8888 .8' `8.`8888. ")
- print(color.BOLD + color.ORANGE + " .888888888. `88888. " + color.BLACK + " 8 8888 " + color.ORANGE + "8 8888 .8' `8.`8888. ")
- print(color.BOLD + color.ORANGE + ".8' `8. `88888." + color.BLACK + " 8 8888 " + color.ORANGE + "8 888888888888 .8' `8.`8888. " + color.END)
- print("\n")
- print(subtitle)
- print("\n")
-
-def env_factory(path, traj="walking", simrate=50, phase_based=False, clock_based=True, state_est=True, dynamics_randomization=True, mirror=False, no_delta=False, ik_baseline=False, learn_gains=False, reward=None, history=0, fixed_speed=None, **kwargs):
- from functools import partial
-
- """
- Returns an *uninstantiated* environment constructor.
-
- Since environments containing cpointers (e.g. Mujoco envs) can't be serialized,
- this allows us to pass their constructors to Ray remote functions instead
- (since the gym registry isn't shared across ray subprocesses we can't simply
- pass gym.make() either)
-
- Note: env.unwrapped.spec is never set, if that matters for some reason.
- """
-
-
- # Custom Cassie Environment
- if path in ['Cassie-v0', 'CassieMin-v0', 'CassiePlayground-v0', 'CassieStandingEnv-v0', 'CassieNoaccelFootDistOmniscient', 'CassieFootDist', 'CassieNoaccelFootDist', 'CassieNoaccelFootDistNojoint', 'CassieNovelFootDist', 'CassieMinInput']:
- from cassie import CassieEnv, CassieMinEnv, CassiePlayground, CassieStandingEnv, CassieEnv_noaccel_footdist_omniscient, CassieEnv_footdist, CassieEnv_noaccel_footdist, CassieEnv_noaccel_footdist_nojoint, CassieEnv_novel_footdist, CassieEnv_mininput
-
- if path == 'Cassie-v0':
- # env_fn = partial(CassieEnv, traj=traj, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, reward=reward, history=history)
- env_fn = partial(CassieEnv, traj=traj, simrate=simrate, phase_based=phase_based, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, learn_gains=learn_gains, ik_baseline=ik_baseline, reward=reward, history=history, fixed_speed=fixed_speed)
- elif path == 'CassieMin-v0':
- env_fn = partial(CassieMinEnv, traj=traj, simrate=simrate, phase_based=phase_based, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, learn_gains=learn_gains, ik_baseline=ik_baseline, reward=reward, history=history, fixed_speed=fixed_speed)
- elif path == 'CassiePlayground-v0':
- env_fn = partial(CassiePlayground, traj=traj, simrate=simrate, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, reward=reward, history=history)
- elif path == 'CassieStandingEnv-v0':
- env_fn = partial(CassieStandingEnv, simrate=simrate, state_est=state_est)
- elif path == 'CassieNoaccelFootDistOmniscient':
- env_fn = partial(CassieEnv_noaccel_footdist_omniscient, simrate=simrate, traj=traj, clock_based=clock_based, state_est=state_est, dynamics_randomization=True, no_delta=no_delta, reward=reward, history=history)
- elif path == 'CassieFootDist':
- env_fn = partial(CassieEnv_footdist, traj=traj, simrate=simrate, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, reward=reward, history=history)
- elif path == 'CassieNoaccelFootDist':
- env_fn = partial(CassieEnv_noaccel_footdist, traj=traj, simrate=simrate, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, reward=reward, history=history)
- elif path == "CassieNoaccelFootDistNojoint":
- env_fn = partial(CassieEnv_noaccel_footdist_nojoint, traj=traj, simrate=simrate, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, reward=reward, history=history)
- elif path == "CassieNovelFootDist":
- env_fn = partial(CassieEnv_novel_footdist, traj=traj, simrate=simrate, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, reward=reward, history=history)
- elif path == "CassieMinInput":
- env_fn = partial(CassieEnv_mininput, traj=traj, simrate=simrate, clock_based=clock_based, state_est=state_est, dynamics_randomization=dynamics_randomization, no_delta=no_delta, reward=reward, history=history)
-
-
- # TODO for Yesh: make mirrored_obs an attribute of environment, configured based on setup parameters
- if mirror:
- from rl.envs.wrappers import SymmetricEnv
- env_fn = partial(SymmetricEnv, env_fn, mirrored_obs=env_fn().mirrored_obs, mirrored_act=[-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4])
-
- return env_fn
-
- # OpenAI Gym environment
- else:
- import gym
- spec = gym.envs.registry.spec(path)
- _kwargs = spec._kwargs.copy()
- _kwargs.update(kwargs)
-
- try:
- if callable(spec._entry_point):
- cls = spec._entry_point(**_kwargs)
- else:
- cls = gym.envs.registration.load(spec._entry_point)
- except AttributeError:
- if callable(spec.entry_point):
- cls = spec.entry_point(**_kwargs)
- else:
- cls = gym.envs.registration.load(spec.entry_point)
-
- return partial(cls, **_kwargs)
-
-# Logger stores in trained_models by default
-def create_logger(args):
- from torch.utils.tensorboard import SummaryWriter
- """Use hyperparms to set a directory to output diagnostic files."""
-
- arg_dict = args.__dict__
- assert "seed" in arg_dict, \
- "You must provide a 'seed' key in your command line arguments"
- assert "logdir" in arg_dict, \
- "You must provide a 'logdir' key in your command line arguments."
- assert "env_name" in arg_dict, \
- "You must provide a 'env_name' key in your command line arguments."
-
- # sort the keys so the same hyperparameters will always have the same hash
- arg_dict = OrderedDict(sorted(arg_dict.items(), key=lambda t: t[0]))
-
- # remove seed so it doesn't get hashed, store value for filename
- # same for logging directory
- run_name = arg_dict.pop('run_name')
- seed = str(arg_dict.pop("seed"))
- logdir = str(arg_dict.pop('logdir'))
- env_name = str(arg_dict['env_name'])
-
- # see if this run has a unique name, if so then that is going to be the name of the folder, even if it overrirdes
- if run_name is not None:
- logdir = os.path.join(logdir, env_name)
- output_dir = os.path.join(logdir, run_name)
- else:
- # see if we are resuming a previous run, if we are mark as continued
- if args.previous is not None:
- if args.exchange_reward is not None:
- output_dir = args.previous[0:-1] + "_NEW-" + args.reward
- else:
- print(args.previous[0:-1])
- output_dir = args.previous[0:-1] + '-cont'
- else:
- # get a unique hash for the hyperparameter settings, truncated at 10 chars
- arg_hash = hashlib.md5(str(arg_dict).encode('ascii')).hexdigest()[0:6] + '-seed' + seed
- logdir = os.path.join(logdir, env_name)
- output_dir = os.path.join(logdir, arg_hash)
-
- # create a directory with the hyperparm hash as its name, if it doesn't
- # already exist.
- os.makedirs(output_dir, exist_ok=True)
-
- # Create a file with all the hyperparam settings in human-readable plaintext,
- # also pickle file for resuming training easily
- info_path = os.path.join(output_dir, "experiment.info")
- pkl_path = os.path.join(output_dir, "experiment.pkl")
- with open(pkl_path, 'wb') as file:
- pickle.dump(args, file)
- with open(info_path, 'w') as file:
- for key, val in arg_dict.items():
- file.write("%s: %s" % (key, val))
- file.write('\n')
-
- logger = SummaryWriter(output_dir, flush_secs=0.1) # flush_secs=0.1 actually slows down quite a bit, even on parallelized set ups
- print("Logging to " + color.BOLD + color.ORANGE + str(output_dir) + color.END)
-
- logger.dir = output_dir
- return logger
-
-# #TODO: Add pausing, and window quiting along with other render functionality
-# def eval_policy(policy, args, run_args):
-
-# import tty
-# import termios
-# import select
-# import numpy as np
-# from cassie import CassieEnv, CassiePlayground, CassieStandingEnv, CassieEnv_noaccel_footdist_omniscient, CassieEnv_footdist, CassieEnv_noaccel_footdist
-# from cassie.cassiemujoco import CassieSim
-
-# def isData():
-# return select.select([sys.stdin], [], [], 0) == ([sys.stdin], [], [])
-
-# max_traj_len = args.traj_len
-# visualize = not args.no_viz
-# print("env name: ", run_args.env_name)
-# if run_args.env_name is None:
-# env_name = args.env_name
-# else:
-# env_name = run_args.env_name
-# if env_name == "Cassie-v0":
-# env = CassieEnv(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, phase_based=run_args.phase_based, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
-# elif env_name == "CassiePlayground-v0":
-# env = CassiePlayground(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history, mission=args.mission)
-# elif env_name == "CassieNoaccelFootDistOmniscient":
-# env = CassieEnv_noaccel_footdist_omniscient(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
-# elif env_name == "CassieFootDist":
-# env = CassieEnv_footdist(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
-# elif env_name == "CassieNoaccelFootDist":
-# env = CassieEnv_noaccel_footdist(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
-# else:
-# env = CassieStandingEnv(state_est=run_args.state_est)
-
-# if args.debug:
-# env.debug = True
-
-# if args.terrain is not None and ".npy" in args.terrain:
-# env.sim = CassieSim("cassie_hfield.xml")
-# hfield_data = np.load(os.path.join("./cassie/cassiemujoco/terrains/", args.terrain))
-# env.sim.set_hfield_data(hfield_data.flatten())
-
-# print(env.reward_func)
-
-# if hasattr(policy, 'init_hidden_state'):
-# policy.init_hidden_state()
-
-# old_settings = termios.tcgetattr(sys.stdin)
-
-# orient_add = 0
-
-# slowmo = False
-
-# if visualize:
-# env.render()
-# render_state = True
-# try:
-# tty.setcbreak(sys.stdin.fileno())
-
-# state = env.reset_for_test()
-# done = False
-# timesteps = 0
-# eval_reward = 0
-# speed = 0.0
-
-# env.update_speed(speed)
-
-# while render_state:
-
-# if isData():
-# c = sys.stdin.read(1)
-# if c == 'w':
-# speed += 0.1
-# elif c == 's':
-# speed -= 0.1
-# elif c == 'j':
-# env.phase_add += .1
-# print("Increasing frequency to: {:.1f}".format(env.phase_add))
-# elif c == 'h':
-# env.phase_add -= .1
-# print("Decreasing frequency to: {:.1f}".format(env.phase_add))
-# elif c == 'l':
-# orient_add += .1
-# print("Increasing orient_add to: ", orient_add)
-# elif c == 'k':
-# orient_add -= .1
-# print("Decreasing orient_add to: ", orient_add)
-
-# elif c == 'x':
-# env.swing_duration += .01
-# print("Increasing swing duration to: ", env.swing_duration)
-# elif c == 'z':
-# env.swing_duration -= .01
-# print("Decreasing swing duration to: ", env.swing_duration)
-# elif c == 'v':
-# env.stance_duration += .01
-# print("Increasing stance duration to: ", env.stance_duration)
-# elif c == 'c':
-# env.stance_duration -= .01
-# print("Decreasing stance duration to: ", env.stance_duration)
-
-# elif c == '1':
-# env.stance_mode = "zero"
-# print("Stance mode: ", env.stance_mode)
-# elif c == '2':
-# env.stance_mode = "grounded"
-# print("Stance mode: ", env.stance_mode)
-# elif c == '3':
-# env.stance_mode = "aerial"
-# print("Stance mode: ", env.stance_mode)
-# elif c == 'r':
-# state = env.reset()
-# speed = env.speed
-# print("Resetting environment via env.reset()")
-# elif c == 'p':
-# push = 100
-# push_dir = 2
-# force_arr = np.zeros(6)
-# force_arr[push_dir] = push
-# env.sim.apply_force(force_arr)
-# elif c == 't':
-# slowmo = not slowmo
-# print("Slowmo : ", slowmo)
-
-# env.update_speed(speed)
-# # print(speed)
-# print("speed: ", env.speed)
-
-# if hasattr(env, 'simrate'):
-# start = time.time()
-
-# if (not env.vis.ispaused()):
-# # Update Orientation
-# env.orient_add = orient_add
-# # quaternion = euler2quat(z=orient_add, y=0, x=0)
-# # iquaternion = inverse_quaternion(quaternion)
-
-# # # TODO: Should probably not assume these indices. Should make them not hard coded
-# # if env.state_est:
-# # curr_orient = state[1:5]
-# # curr_transvel = state[15:18]
-# # else:
-# # curr_orient = state[2:6]
-# # curr_transvel = state[20:23]
-
-# # new_orient = quaternion_product(iquaternion, curr_orient)
-
-# # if new_orient[0] < 0:
-# # new_orient = -new_orient
-
-# # new_translationalVelocity = rotate_by_quaternion(curr_transvel, iquaternion)
-
-# # if env.state_est:
-# # state[1:5] = torch.FloatTensor(new_orient)
-# # state[15:18] = torch.FloatTensor(new_translationalVelocity)
-# # # state[0] = 1 # For use with StateEst. Replicate hack that height is always set to one on hardware.
-# # else:
-# # state[2:6] = torch.FloatTensor(new_orient)
-# # state[20:23] = torch.FloatTensor(new_translationalVelocity)
-
-# action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()
-
-# state, reward, done, _ = env.step(action)
-
-# # if env.lfoot_vel[2] < -0.6:
-# # print("left foot z vel over 0.6: ", env.lfoot_vel[2])
-# # if env.rfoot_vel[2] < -0.6:
-# # print("right foot z vel over 0.6: ", env.rfoot_vel[2])
-
-# eval_reward += reward
-# timesteps += 1
-# qvel = env.sim.qvel()
-# # print("actual speed: ", np.linalg.norm(qvel[0:2]))
-# # print("commanded speed: ", env.speed)
-
-# if args.no_viz:
-# yaw = quaternion2euler(new_orient)[2]
-# print("stp = {} yaw = {:.2f} spd = {} ep_r = {:.2f} stp_r = {:.2f}".format(timesteps, yaw, speed, eval_reward, reward))
-
-# if visualize:
-# render_state = env.render()
-# if hasattr(env, 'simrate'):
-# # assume 40hz
-# end = time.time()
-# delaytime = max(0, 1000 / 40000 - (end-start))
-# if slowmo:
-# while(time.time() - end < delaytime*10):
-# env.render()
-# time.sleep(delaytime)
-# else:
-# time.sleep(delaytime)
-
-# print("Eval reward: ", eval_reward)
-
-# finally:
-# termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
-
-class EvalProcessClass():
- def __init__(self, args, run_args):
-
- if run_args.phase_based and not args.no_viz:
-
- self.plot_pipe, plotter_pipe = mp.Pipe()
- self.plotter = LivePlot()
- self.plot_process = mp.Process(target=self.plotter, args=(plotter_pipe,), daemon=True)
- self.plot_process.start()
-
- #TODO: Add pausing, and window quiting along with other render functionality
- def eval_policy(self, policy, args, run_args):
-
- def print_input_update(e):
- print(f"\n\nstance dur.: {e.stance_duration:.2f}\t swing dur.: {e.swing_duration:.2f}\t stance mode: {e.stance_mode}\n")
-
- if run_args.phase_based and not args.no_viz:
- send = self.plot_pipe.send
-
- def isData():
- return select.select([sys.stdin], [], [], 0) == ([sys.stdin], [], [])
-
- if args.debug:
- args.stats = False
-
- if args.reward is None:
- args.reward = run_args.reward
-
- max_traj_len = args.traj_len
- visualize = not args.no_viz
- print("env name: ", run_args.env_name)
- if run_args.env_name is None:
- env_name = args.env_name
- else:
- env_name = run_args.env_name
- if env_name == "Cassie-v0":
- env = CassieEnv(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, phase_based=run_args.phase_based, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieMin-v0":
- env = CassieMinEnv(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, phase_based=run_args.phase_based, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassiePlayground-v0":
- env = CassiePlayground(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history, mission=args.mission)
- elif env_name == "CassieNoaccelFootDistOmniscient":
- env = CassieEnv_noaccel_footdist_omniscient(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieFootDist":
- env = CassieEnv_footdist(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieNoaccelFootDist":
- env = CassieEnv_noaccel_footdist(traj=run_args.traj, simrate=run_args.simrate, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieNoaccelFootDistNojoint":
- env = CassieEnv_noaccel_footdist_nojoint(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieNovelFootDist":
- env = CassieEnv_novel_footdist(traj=run_args.traj, simrate=run_args.simrate, clock_based=run_args.clock_based, state_est=run_args.state_est, dynamics_randomization=run_args.dyn_random, no_delta=run_args.no_delta, reward=args.reward, history=run_args.history)
- elif env_name == "CassieMinInput":
- env = CassieEnv_mininput(traj=run_args.traj, simrate=run_args.simrate, clock_based=run_args.clock_based, state_est=run_args.state_est, dynamics_randomization=run_args.dyn_random, no_delta=run_args.no_delta, reward=args.reward, history=run_args.history)
- else:
- env = CassieStandingEnv(state_est=run_args.state_est)
-
- if args.debug:
- env.debug = True
-
- if args.terrain is not None and ".npy" in args.terrain:
- env.sim = CassieSim("cassie_hfield.xml")
- hfield_data = np.load(os.path.join("./cassie/cassiemujoco/terrains/", args.terrain))
- env.sim.set_hfield_data(hfield_data.flatten())
-
- print(env.reward_func)
- print()
-
- if hasattr(policy, 'init_hidden_state'):
- policy.init_hidden_state()
-
- old_settings = termios.tcgetattr(sys.stdin)
-
- orient_add = 0
-
- slowmo = False
-
- if visualize:
- env.render()
- render_state = True
- try:
- tty.setcbreak(sys.stdin.fileno())
-
- state = env.reset_for_test()
- done = False
- timesteps = 0
- eval_reward = 0
- speed = 0.0
-
- env.update_speed(speed)
-
- while render_state:
-
- if isData():
- c = sys.stdin.read(1)
- if c == 'w':
- speed += 0.1
- elif c == 's':
- speed -= 0.1
- elif c == 'j':
- env.phase_add += .1
- # print("Increasing frequency to: {:.1f}".format(env.phase_add))
- elif c == 'h':
- env.phase_add -= .1
- # print("Decreasing frequency to: {:.1f}".format(env.phase_add))
- elif c == 'l':
- orient_add -= .1
- # print("Increasing orient_add to: ", orient_add)
- elif c == 'k':
- orient_add += .1
- # print("Decreasing orient_add to: ", orient_add)
-
- elif c == 'x':
- env.swing_duration += .01
- print_input_update(env)
- elif c == 'z':
- env.swing_duration -= .01
- print_input_update(env)
- elif c == 'v':
- env.stance_duration += .01
- print_input_update(env)
- elif c == 'c':
- env.stance_duration -= .01
- print_input_update(env)
-
- elif c == '1':
- env.stance_mode = "zero"
- print_input_update(env)
- elif c == '2':
- env.stance_mode = "grounded"
- print_input_update(env)
- elif c == '3':
- env.stance_mode = "aerial"
-
- elif c == 'r':
- state = env.reset()
- speed = env.speed
- if hasattr(policy, 'init_hidden_state'):
- policy.init_hidden_state()
- print("Resetting environment via env.reset()")
- elif c == 'p':
- push = 100
- push_dir = 2
- force_arr = np.zeros(6)
- force_arr[push_dir] = push
- env.sim.apply_force(force_arr)
- elif c == 't':
- slowmo = not slowmo
- print("Slowmo : \n", slowmo)
-
- env.update_speed(speed)
- # print(speed)
-
- if env.phase_based and visualize:
- send((env.swing_duration, env.stance_duration, env.strict_relaxer, env.stance_mode, env.have_incentive))
-
- if args.stats:
- print(f"act spd: {env.sim.qvel()[0]:.2f} cmd speed: {env.speed:.2f} phase add: {env.phase_add:.2f} orient add: {orient_add:.2f}", end="\r")
- # print(f"act spd: {env.sim.qvel()[0]:.2f}\t cmd speed: {env.speed:.2f}\t phase add: {env.phase_add:.2f}\t orient add: {orient_add}", end="\r")
-
- if hasattr(env, 'simrate'):
- start = time.time()
-
- if (not env.vis.ispaused()):
- # Update Orientation
- env.orient_add = orient_add
- # quaternion = euler2quat(z=orient_add, y=0, x=0)
- # iquaternion = inverse_quaternion(quaternion)
-
- # # TODO: Should probably not assume these indices. Should make them not hard coded
- # if env.state_est:
- # curr_orient = state[1:5]
- # curr_transvel = state[15:18]
- # else:
- # curr_orient = state[2:6]
- # curr_transvel = state[20:23]
-
- # new_orient = quaternion_product(iquaternion, curr_orient)
-
- # if new_orient[0] < 0:
- # new_orient = -new_orient
-
- # new_translationalVelocity = rotate_by_quaternion(curr_transvel, iquaternion)
-
- # if env.state_est:
- # state[1:5] = torch.FloatTensor(new_orient)
- # state[15:18] = torch.FloatTensor(new_translationalVelocity)
- # # state[0] = 1 # For use with StateEst. Replicate hack that height is always set to one on hardware.
- # else:
- # state[2:6] = torch.FloatTensor(new_orient)
- # state[20:23] = torch.FloatTensor(new_translationalVelocity)
-
- action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()
-
- state, reward, done, _ = env.step(action)
-
- # if env.lfoot_vel[2] < -0.6:
- # print("left foot z vel over 0.6: ", env.lfoot_vel[2])
- # if env.rfoot_vel[2] < -0.6:
- # print("right foot z vel over 0.6: ", env.rfoot_vel[2])
-
- eval_reward += reward
- timesteps += 1
- qvel = env.sim.qvel()
- # print("actual speed: ", np.linalg.norm(qvel[0:2]))
- # print("commanded speed: ", env.speed)
-
- if args.no_viz:
- yaw = quaternion2euler(new_orient)[2]
- print("stp = {} yaw = {:.2f} spd = {} ep_r = {:.2f} stp_r = {:.2f}".format(timesteps, yaw, speed, eval_reward, reward))
-
- if visualize:
- render_state = env.render()
- if hasattr(env, 'simrate'):
- # assume 40hz
- end = time.time()
- delaytime = max(0, 1000 / 40000 - (end-start))
- if slowmo:
- while(time.time() - end < delaytime*10):
- env.render()
- time.sleep(delaytime)
- else:
- time.sleep(delaytime)
-
- print("Eval reward: ", eval_reward)
-
- finally:
- termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
-
-def collect_data(policy, args, run_args):
- wait_steps = 0
- num_cycles = 35
- speed = 0.0
-
- if args.reward is None:
- args.reward = run_args.reward
-
- if run_args.env_name is None:
- env_name = args.env_name
- else:
- env_name = run_args.env_name
- if env_name == "Cassie-v0":
- env = CassieEnv(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, phase_based=run_args.phase_based, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassiePlayground-v0":
- env = CassiePlayground(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history, mission=args.mission)
- elif env_name == "CassieNoaccelFootDistOmniscient":
- env = CassieEnv_noaccel_footdist_omniscient(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieFootDist":
- env = CassieEnv_footdist(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieNoaccelFootDist":
- env = CassieEnv_noaccel_footdist(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- elif env_name == "CassieNoaccelFootDistNojoint":
- env = CassieEnv_noaccel_footdist_nojoint(traj=run_args.traj, state_est=run_args.state_est, no_delta=run_args.no_delta, dynamics_randomization=run_args.dyn_random, clock_based=run_args.clock_based, reward=args.reward, history=run_args.history)
- else:
- env = CassieStandingEnv(state_est=run_args.state_est)
-
- if hasattr(policy, 'init_hidden_state'):
- policy.init_hidden_state()
-
- state = torch.Tensor(env.reset_for_test())
- env.update_speed(speed)
- env.render()
-
- time_log, speed_log, grf_log = [], [], []
-
- # print("iros: ", iros_env.simrate, iros_env.phaselen)
- # print("aslip: ", aslip_env.simrate, aslip_env.phaselen)
-
- with torch.no_grad():
-
- # Run few cycles to stabilize (do separate incase two envs have diff phaselens)
- for i in range(wait_steps):
- action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()
- state, reward, done, _ = env.step(action)
- env.render()
- print(f"act spd: {env.sim.qvel()[0]:.2f}\t cmd speed: {env.speed:.2f}")
- # curr_qpos = aslip_env.sim.qpos()
- # print("curr height: ", curr_qpos[2])
-
- # Collect actual data (avg foot force over simrate)
- # start_time = time.time()
- # for i in range(math.floor(num_cycles*env.phaselen)+1):
- # action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()
- # state, reward, done, _ = env.step(action)
- # speed_log.append([env.speed, env.sim.qvel()[0]])
-
- # # grf_log.append(np.concatenate([[time.time()-start_time],env.sim.get_foot_forces()]))
- # grf_log.append([time.time()-start_time, env.l_foot_frc, env.r_foot_frc])
- # env.render()
- # print(f"act spd: {env.sim.qvel()[0]:.2f}\t cmd speed: {env.speed:.2f}")
- # # curr_qpos = aslip_env.sim.qpos()
- # # print("curr height: ", curr_qpos[2])
-
- # Collect actual data (foot force each sim step)
- print("Start actual data")
- start_time = time.time()
- for i in range(num_cycles):
- for j in range(math.floor(env.phaselen)+1):
- action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()
- for k in range(env.simrate):
- env.step_simulation(action)
- time_log.append(time.time()-start_time)
- speed_log.append([env.speed, env.sim.qvel()[0]])
- grf_log.append(env.sim.get_foot_forces())
-
- env.time += 1
- env.phase += env.phase_add
- if env.phase > env.phaselen:
- env.phase = 0
- env.counter += 1
- state = env.get_full_state()
- env.speed = i * 0.1
- env.render()
-
- time_log, speed_log, grf_log = map(np.array, (time_log, speed_log, grf_log))
- print(speed_log.shape)
- print(grf_log.shape)
-
- ### Process the data
-
- # average data and get std dev
- mean_speed = np.mean(speed_log, axis=0)
- stddev_speed = np.mean(speed_log, axis=0)
- mean_grfs = np.mean(grf_log[:, 1:], axis=0)
- stddev_grfs = np.std(grf_log[:, 1:], axis=0)
- print(mean_speed)
- print(stddev_speed)
- print(mean_grfs)
- print(stddev_grfs)
-
- ### Save the data
- output = {
- "time": time_log,
- "speed": speed_log,
- "grfs": grf_log
- }
- with open(os.path.join(args.path, "collect_data.pkl"), "wb") as f:
- pickle.dump(output, f)
-
- with open(os.path.join(args.path, "collect_data.pkl"), "rb") as f:
- data = pickle.load(f)
-
- time_data = data["time"]
- speed_data = data["speed"]
- grf_data = data["grfs"]
-
- ### Plot the data
- fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True)
- axs[0].plot(time_data, speed_data, label="commanded speed")
- axs[0].plot(time_data, speed_data, label="actual speed")
- axs[0].set_title("Speed")
- axs[0].set_xlabel("Time (Seconds)")
- axs[0].set_ylabel("m/s")
- axs[1].plot(time_data, grf_log[:,0], label="sim left foot")
- axs[1].plot(time_data, grf_log[:,1], label="sim right foot")
- axs[1].set_title("GRFs")
- axs[1].set_xlabel("Time (Seconds)")
- axs[1].set_ylabel("Newtons")
- plt.legend(loc="upper right")
- plt.show()
-
-# Rule for curriculum learning is that env observation space should be the same (so attributes like env.clock_based or env.state_est shouldn't be different and are forced to be same here)
-# deal with loading hyperparameters of previous run continuation
-def parse_previous(args):
- if args.previous is not None:
- run_args = pickle.load(open(args.previous + "experiment.pkl", "rb"))
- args.env_name = run_args.env_name
- args.traj = run_args.traj
- args.phase_based = run_args.phase_based
- args.clock_based = run_args.clock_based
- args.state_est = run_args.state_est
- args.no_delta = run_args.no_delta
- args.recurrent = run_args.recurrent
- args.learn_gains = run_args.learn_gains
- args.ik_baseline = run_args.ik_baseline,
- if args.env_name == "CassiePlayground-v0":
- args.reward = "command"
- args.run_name = run_args.run_name + "-playground"
- if args.exchange_reward != None:
- args.reward = args.exchange_reward
- args.run_name = run_args.run_name + "_NEW-" + args.reward
- else:
- args.reward = run_args.reward
- args.run_name = run_args.run_name + "--cont"
- return args
diff --git a/util/env.py b/util/env.py
new file mode 100644
index 00000000..32c0c053
--- /dev/null
+++ b/util/env.py
@@ -0,0 +1,72 @@
+import os
+import time
+import torch
+import numpy as np
+
+from cassie import CassieEnv, CassieTrajEnv, CassiePlayground, CassieStandingEnv
+
+def env_factory(path, command_profile="clock", input_profile="full", simrate=50, dynamics_randomization=True, mirror=False, learn_gains=False, reward=None, history=0, no_delta=True, traj=None, ik_baseline=False, **kwargs):
+ from functools import partial
+
+ """
+ Returns an *uninstantiated* environment constructor.
+
+ Since environments containing cpointers (e.g. Mujoco envs) can't be serialized,
+ this allows us to pass their constructors to Ray remote functions instead
+ (since the gym registry isn't shared across ray subprocesses we can't simply
+ pass gym.make() either)
+
+ Note: env.unwrapped.spec is never set, if that matters for some reason.
+ """
+
+ # Custom Cassie Environment
+ if path in ['Cassie-v0', 'CassieTraj-v0', 'CassiePlayground-v0', 'CassieStandingEnv-v0']:
+
+ if path == 'Cassie-v0':
+ env_fn = partial(CassieEnv, command_profile=command_profile, input_profile=input_profile, simrate=simrate, dynamics_randomization=dynamics_randomization, learn_gains=learn_gains, reward=reward, history=history)
+ elif path == 'CassieTraj-v0':
+ env_fn = partial(CassieTrajEnv, traj=traj, command_profile=command_profile, input_profile=input_profile, simrate=simrate, dynamics_randomization=dynamics_randomization, no_delta=no_delta, learn_gains=learn_gains, ik_baseline=ik_baseline, reward=reward, history=history)
+ elif path == 'CassiePlayground-v0':
+ env_fn = partial(CassiePlayground, command_profile=command_profile, input_profile=input_profile, simrate=simrate, dynamics_randomization=dynamics_randomization, learn_gains=learn_gains, reward=reward, history=history)
+ elif path == 'CassieStandingEnv-v0':
+ env_fn = partial(CassieStandingEnv, command_profile=command_profile, input_profile=input_profile, simrate=simrate, dynamics_randomization=dynamics_randomization, learn_gains=learn_gains, reward=reward, history=history)
+
+ if mirror:
+ from rl.envs.wrappers import SymmetricEnv
+ env_fn = partial(SymmetricEnv, env_fn, mirrored_obs=env_fn().mirrored_obs, mirrored_act=env_fn().mirrored_acts)
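+        # note: env_fn() is instantiated here just to read the mirror indices, so any env
+        # wrapped with mirror=True must define both mirrored_obs and mirrored_acts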
+
+ print()
+ print("Environment: {}".format(path))
+ print(" ├ reward: {}".format(reward))
+ print(" ├ input prof: {}".format(input_profile))
+ print(" ├ cmd prof: {}".format(command_profile))
+ print(" ├ learn gains: {}".format(learn_gains))
+ print(" ├ dyn_random: {}".format(dynamics_randomization))
+ print(" ├ mirror: {}".format(mirror))
+ if path == "CassieTraj-v0":
+ print(" ├ traj: {}".format(traj))
+ print(" ├ ik baseline: {}".format(ik_baseline))
+ print(" ├ no_delta: {}".format(no_delta))
+ print(" └ obs_dim: {}".format(env_fn().observation_space.shape[0]))
+
+ return env_fn
+
+ # OpenAI Gym environment
+ else:
+ import gym
+ spec = gym.envs.registry.spec(path)
+ _kwargs = spec._kwargs.copy()
+ _kwargs.update(kwargs)
+
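+        # Older gym versions expose spec._entry_point; newer ones expose spec.entry_point.
+        # Try the old attribute first and fall back on AttributeError.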
+ try:
+ if callable(spec._entry_point):
+ cls = spec._entry_point(**_kwargs)
+ else:
+ cls = gym.envs.registration.load(spec._entry_point)
+ except AttributeError:
+ if callable(spec.entry_point):
+ cls = spec.entry_point(**_kwargs)
+ else:
+ cls = gym.envs.registration.load(spec.entry_point)
+
+ return partial(cls, **_kwargs)
diff --git a/util/eval.py b/util/eval.py
new file mode 100644
index 00000000..42f84aaf
--- /dev/null
+++ b/util/eval.py
@@ -0,0 +1,206 @@
+import torch
+import os
+import sys, time
+
+from tkinter import *
+import multiprocessing as mp
+from cassie.phase_function import LivePlot
+import matplotlib.pyplot as plt
+
+import tty
+import termios
+import select
+import numpy as np
+
+from util.env import env_factory
+
+class EvalProcessClass():
+ def __init__(self, args, run_args):
+
+ if run_args.command_profile == "phase" and not args.no_viz:
+ self.live_plot = True
+ self.plot_pipe, plotter_pipe = mp.Pipe()
+ self.plotter = LivePlot()
+ self.plot_process = mp.Process(target=self.plotter, args=(plotter_pipe,), daemon=True)
+ self.plot_process.start()
+ else:
+ self.live_plot = False
+
+    # TODO: Add pausing and window quitting, along with other render functionality
+ def eval_policy(self, policy, args, run_args):
+
+ def print_input_update(e):
+ print(f"\n\nstance dur.: {e.stance_duration:.2f}\t swing dur.: {e.swing_duration:.2f}\t stance mode: {e.stance_mode}\n")
+
+ if self.live_plot:
+ send = self.plot_pipe.send
+
+ def isData():
+ return select.select([sys.stdin], [], [], 0) == ([sys.stdin], [], [])
+
+ if args.debug:
+ args.stats = False
+
+ if args.reward is None:
+ args.reward = run_args.reward
+
+ max_traj_len = args.traj_len
+ visualize = not args.no_viz
+ print("env name: ", run_args.env_name)
+ if run_args.env_name is None:
+ env_name = args.env_name
+ else:
+ env_name = run_args.env_name
+
+        # env_factory returns a constructor; the trailing () below instantiates the env.
+        # Use the env_name resolved above (prefer the one recorded with the saved run).
+        env = env_factory(
+            env_name,
+ command_profile=run_args.command_profile,
+ input_profile=run_args.input_profile,
+ simrate=run_args.simrate,
+ dynamics_randomization=run_args.dyn_random,
+ mirror=run_args.mirror,
+ learn_gains=run_args.learn_gains,
+ reward=run_args.reward,
+ history=run_args.history,
+ no_delta=run_args.no_delta,
+ traj=run_args.traj,
+ ik_baseline=run_args.ik_baseline
+ )()
+
+ if args.debug:
+ env.debug = True
+
+        if args.terrain is not None and ".npy" in args.terrain:
+            # CassieSim isn't imported at the top of this file; import path assumed here
+            from cassie.cassiemujoco import CassieSim
+            env.sim = CassieSim("cassie_hfield.xml")
+            hfield_data = np.load(os.path.join("./cassie/cassiemujoco/terrains/", args.terrain))
+            env.sim.set_hfield_data(hfield_data.flatten())
+
+ print(env.reward_func)
+ print()
+
+ if hasattr(policy, 'init_hidden_state'):
+ policy.init_hidden_state()
+
+ old_settings = termios.tcgetattr(sys.stdin)
+
+ orient_add = 0
+
+ slowmo = False
+
+ if visualize:
+ env.render()
+ render_state = True
+ try:
+ tty.setcbreak(sys.stdin.fileno())
+
+ state = env.reset_for_test()
+ done = False
+ timesteps = 0
+ eval_reward = 0
+ speed = 0.0
+ side_speed = 0.0
+
+ env.update_speed(speed, side_speed)
+
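+            # Interactive keyboard controls (read from stdin while rendering):
+            #   w/s : +/- forward speed          d/a : +/- sideways speed
+            #   j/h : +/- phase_add (step frequency)
+            #   k/l : +/- orientation offset (orient_add)
+            #   x/z : +/- swing duration         v/c : +/- stance duration
+            #   1/2/3 : stance mode (zero / grounded / aerial)
+            #   r : reset env    p : apply push    t : toggle slow motion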
+ while render_state:
+
+ if isData():
+ c = sys.stdin.read(1)
+ if c == 'w':
+ speed += 0.1
+ elif c == 's':
+ speed -= 0.1
+ elif c == 'd':
+ side_speed += 0.02
+ elif c == 'a':
+                        side_speed -= 0.02
+ elif c == 'j':
+ env.phase_add += .1
+ # print("Increasing frequency to: {:.1f}".format(env.phase_add))
+ elif c == 'h':
+ env.phase_add -= .1
+ # print("Decreasing frequency to: {:.1f}".format(env.phase_add))
+                    elif c == 'l':
+                        orient_add -= .1
+                        # print("Decreasing orient_add to: ", orient_add)
+                    elif c == 'k':
+                        orient_add += .1
+                        # print("Increasing orient_add to: ", orient_add)
+
+ elif c == 'x':
+ env.swing_duration += .01
+ print_input_update(env)
+ elif c == 'z':
+ env.swing_duration -= .01
+ print_input_update(env)
+ elif c == 'v':
+ env.stance_duration += .01
+ print_input_update(env)
+ elif c == 'c':
+ env.stance_duration -= .01
+ print_input_update(env)
+
+ elif c == '1':
+ env.stance_mode = "zero"
+ print_input_update(env)
+ elif c == '2':
+ env.stance_mode = "grounded"
+ print_input_update(env)
+ elif c == '3':
+ env.stance_mode = "aerial"
+
+ elif c == 'r':
+ state = env.reset()
+ speed = env.speed
+ if hasattr(policy, 'init_hidden_state'):
+ policy.init_hidden_state()
+ print("Resetting environment via env.reset()")
+ elif c == 'p':
+ push = 100
+ push_dir = 2
+ force_arr = np.zeros(6)
+ force_arr[push_dir] = push
+ env.sim.apply_force(force_arr)
+ elif c == 't':
+ slowmo = not slowmo
+ print("Slowmo : \n", slowmo)
+
+ env.update_speed(speed, side_speed)
+ # print(speed)
+
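+                # Stream the current phase-function parameters to the LivePlot process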
+ if self.live_plot:
+ send((env.swing_duration, env.stance_duration, env.strict_relaxer, env.stance_mode, env.have_incentive))
+
+ if args.stats:
+ print(f"act spd: {env.sim.qvel()[0]:+.2f} cmd speed: {env.speed:+.2f} cmd_sd_spd: {env.side_speed:+.2f} phase add: {env.phase_add:.2f} orient add: {orient_add:+.2f}", end="\r")
+
+ if hasattr(env, 'simrate'):
+ start = time.time()
+
+ if (not env.vis.ispaused()):
+ # Update Orientation
+ env.orient_add = orient_add
+
+ action = policy.forward(torch.Tensor(state), deterministic=True).detach().numpy()
+
+ state, reward, done, _ = env.step(action)
+
+ eval_reward += reward
+ timesteps += 1
+
+ if visualize:
+ render_state = env.render()
+ if hasattr(env, 'simrate'):
+ end = time.time()
+ delaytime = max(0, env.simrate / 2000 - (end-start))
+ if slowmo:
+ while(time.time() - end < delaytime*10):
+ env.render()
+ time.sleep(delaytime)
+ else:
+ time.sleep(delaytime)
+
+ print("Eval reward: ", eval_reward)
+
+ finally:
+ termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
\ No newline at end of file
diff --git a/util/log.py b/util/log.py
new file mode 100644
index 00000000..e4285e1e
--- /dev/null
+++ b/util/log.py
@@ -0,0 +1,91 @@
+from collections import OrderedDict
+import hashlib, os, pickle
+
+class color:
+ BOLD = '\033[1m\033[48m'
+ END = '\033[0m'
+ ORANGE = '\033[38;5;202m'
+ BLACK = '\033[38;5;240m'
+
+# Logger stores in trained_models by default
+def create_logger(args):
+    """Use hyperparameters to set a directory for outputting diagnostic files."""
+    from torch.utils.tensorboard import SummaryWriter
+
+ arg_dict = args.__dict__
+ assert "seed" in arg_dict, \
+ "You must provide a 'seed' key in your command line arguments"
+ assert "logdir" in arg_dict, \
+ "You must provide a 'logdir' key in your command line arguments."
+ assert "env_name" in arg_dict, \
+        "You must provide an 'env_name' key in your command line arguments."
+
+ # sort the keys so the same hyperparameters will always have the same hash
+ arg_dict = OrderedDict(sorted(arg_dict.items(), key=lambda t: t[0]))
+
+ # remove seed so it doesn't get hashed, store value for filename
+ # same for logging directory
+ run_name = arg_dict.pop('run_name')
+ seed = str(arg_dict.pop("seed"))
+ logdir = str(arg_dict.pop('logdir'))
+ env_name = str(arg_dict['env_name'])
+
+    # see if this run has a unique name; if so, that becomes the name of the output folder, even if it overrides an existing one
+ if run_name is not None:
+ logdir = os.path.join(logdir, env_name)
+ output_dir = os.path.join(logdir, run_name)
+ else:
+        # see if we are resuming a previous run; if so, mark it as continued
+ if args.previous is not None:
+ if args.exchange_reward is not None:
+ output_dir = args.previous[0:-1] + "_NEW-" + args.reward
+ else:
+ print(args.previous[0:-1])
+ output_dir = args.previous[0:-1] + '-cont'
+ else:
+            # get a unique hash for the hyperparameter settings, truncated to 6 chars, plus the seed
+ arg_hash = hashlib.md5(str(arg_dict).encode('ascii')).hexdigest()[0:6] + '-seed' + seed
+ logdir = os.path.join(logdir, env_name)
+ output_dir = os.path.join(logdir, arg_hash)
+
+    # create a directory with the hyperparameter hash as its name, if it doesn't
+    # already exist.
+ os.makedirs(output_dir, exist_ok=True)
+
+    # Create a file with all the hyperparameter settings in human-readable plaintext,
+    # plus a pickle file so training can easily be resumed
+ info_path = os.path.join(output_dir, "experiment.info")
+ pkl_path = os.path.join(output_dir, "experiment.pkl")
+ with open(pkl_path, 'wb') as file:
+ pickle.dump(args, file)
+ with open(info_path, 'w') as file:
+ for key, val in arg_dict.items():
+ file.write("%s: %s" % (key, val))
+ file.write('\n')
+
+    logger = SummaryWriter(output_dir, flush_secs=0.1)  # flush_secs=0.1 actually slows things down quite a bit, even on parallelized setups
+ print("Logging to " + color.BOLD + color.ORANGE + str(output_dir) + color.END)
+
+ logger.dir = output_dir
+ return logger
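+
+# Example (a sketch): given args with logdir="trained_models/ppo", env_name="Cassie-v0",
+# seed=42, and no run_name or previous set, the output directory resolves to something
+# like trained_models/ppo/Cassie-v0/<6-char-hash>-seed42/ and holds experiment.info,
+# experiment.pkl, and the tensorboard event files.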
+
+# Loads the hyperparameters of a previous run when continuing training.
+# For curriculum learning the env observation space must stay the same, so attributes
+# that shape the observation (e.g. the command/input profiles) are forced to match here.
+def parse_previous(args):
+ if args.previous is not None:
+ run_args = pickle.load(open(args.previous + "experiment.pkl", "rb"))
+ args.recurrent = run_args.recurrent
+ args.env_name = run_args.env_name
+ args.command_profile = run_args.command_profile
+ args.input_profile = run_args.input_profile
+ args.learn_gains = run_args.learn_gains
+ args.traj = run_args.traj
+ args.no_delta = run_args.no_delta
+ args.ik_baseline = run_args.ik_baseline
+ if args.exchange_reward is not None:
+ args.reward = args.exchange_reward
+ args.run_name = run_args.run_name + "_NEW-" + args.reward
+ else:
+ args.reward = run_args.reward
+ args.run_name = run_args.run_name + "--cont"
+ return args
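+
+# Example (a sketch): if args.previous = "trained_models/ppo/Cassie-v0/<hash>-seed42/"
+# (note the trailing slash, since "experiment.pkl" is appended directly), the saved
+# hyperparameters are copied onto args and the run is renamed "<old_run_name>--cont",
+# or "<old_run_name>_NEW-<reward>" if args.exchange_reward is set.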
diff --git a/util/logo.py b/util/logo.py
new file mode 100644
index 00000000..a4ea363d
--- /dev/null
+++ b/util/logo.py
@@ -0,0 +1,22 @@
+class color:
+ BOLD = '\033[1m\033[48m'
+ END = '\033[0m'
+ ORANGE = '\033[38;5;202m'
+ BLACK = '\033[38;5;240m'
+
+
+def print_logo(subtitle="", option=2):
+ print()
+ print(color.BOLD + color.ORANGE + " .8. " + color.BLACK + " 8 888888888o " + color.ORANGE + "8 8888888888 `8.`8888. ,8' ")
+ print(color.BOLD + color.ORANGE + " .888. " + color.BLACK + " 8 8888 `88. " + color.ORANGE + "8 8888 `8.`8888. ,8' ")
+ print(color.BOLD + color.ORANGE + " :88888. " + color.BLACK + " 8 8888 `88 " + color.ORANGE + "8 8888 `8.`8888. ,8' ")
+ print(color.BOLD + color.ORANGE + " . `88888. " + color.BLACK + " 8 8888 ,88 " + color.ORANGE + "8 8888 `8.`8888.,8' ")
+ print(color.BOLD + color.ORANGE + " .8. `88888. " + color.BLACK + " 8 8888. ,88' " + color.ORANGE + "8 888888888888 `8.`88888' ")
+ print(color.BOLD + color.ORANGE + " .8`8. `88888. " + color.BLACK + " 8 888888888P' " + color.ORANGE + "8 8888 .88.`8888. ")
+ print(color.BOLD + color.ORANGE + " .8' `8. `88888. " + color.BLACK + " 8 8888 " + color.ORANGE + "8 8888 .8'`8.`8888. ")
+ print(color.BOLD + color.ORANGE + " .8' `8. `88888. " + color.BLACK + " 8 8888 " + color.ORANGE + "8 8888 .8' `8.`8888. ")
+ print(color.BOLD + color.ORANGE + " .888888888. `88888. " + color.BLACK + " 8 8888 " + color.ORANGE + "8 8888 .8' `8.`8888. ")
+ print(color.BOLD + color.ORANGE + ".8' `8. `88888." + color.BLACK + " 8 8888 " + color.ORANGE + "8 888888888888 .8' `8.`8888. " + color.END)
+ print("\n")
+ print(subtitle)
+ print("\n")