experiments/ddpg_move_to_a_point_irr_dims.py

"""###IMP dummy_seed should always be last in the order in the OrderedDict below!!!
"""
import itertools
from ray import tune
from collections import OrderedDict

num_seeds = 10


var_env_configs = OrderedDict(
    {
        "state_space_dim": [
            2,
            3,
            4,
            6,
            10,
        ],  # , 10, 12, 14] # [2**i for i in range(1,6)]
        "state_space_relevant_indices": [[0, 1]],
        "action_space_relevant_indices": [[0, 1]],
        "delay": [0],  # + [2**i for i in range(4)],
        # 'sequence_length': [1], #, 2, 3, 4],#i for i in range(1,4)]
        # 'reward_density': [0.25], # np.linspace(0.0, 1.0, num=5)
        "make_denser": [True],
        # 'terminal_state_density': [0.25], # np.linspace(0.1, 1.0, num=5)
        "transition_noise": [0],
        "reward_noise": [0],  # , 1, 5, 10, 25] # Std dev. of normal dist.
        "target_point": [[0, 0]],
        "target_radius": [0.5],
        "state_space_max": [10],
        "action_space_max": [1],
        "action_loss_weight": [0.0],
        "time_unit": [1.0],
        "transition_dynamics_order": [1],
        "dummy_seed": [i for i in range(num_seeds)],
    }
)

var_configs = OrderedDict({"env": var_env_configs})

env_config = {
    "env": "RLToy-v0",
    "horizon": 100,
    "env_config": {
        "seed": 0,  # seed
        "state_space_type": "continuous",
        "action_space_type": "continuous",
        "inertia": 1,
        "reward_scale": 1.0,
        "reward_function": "move_to_a_point",
        # 'make_denser': True,
        # "log_level": 'INFO',
        "log_filename": "/tmp/ddpg_mv_pt.log",
    },
}

algorithm = "DDPG"
agent_config = {
    # Learning rate for the critic (Q-function) optimizer.
    "critic_lr": 1e-3,
    # Learning rate for the actor (policy) optimizer.
    "actor_lr": 1e-3,
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 0.02,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 2000,
    # Postprocess the policy network model output with these hidden layers. If
    # use_state_preprocessor is False, then these will be the *only* hidden
    # layers in the network.
    "actor_hiddens": [32, 32],
    # Postprocess the critic network model output with these hidden layers;
    # again, if use_state_preprocessor is True, then the state will be
    # preprocessed by the model specified with the "model" config option first.
    "critic_hiddens": [32, 32],
    # Apply a state preprocessor with spec given by the "model" config option
    # (like other RL algorithms). This is mostly useful if you have a weird
    # observation shape, like an image. Disabled by default.
    "use_state_preprocessor": False,
    # Hidden layers activation of the postprocessing stage of the policy
    # network
    "actor_hidden_activation": "relu",
    # Hidden layers activation of the postprocessing state of the critic.
    "critic_hidden_activation": "relu",
    # N-step Q learning
    "n_step": 1,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of a huber loss
    "huber_threshold": 1.0,
    # Weights for L2 regularization
    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": False,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Time steps over which the beta parameter is annealed.
    # "prioritized_replay_beta_annealing_timesteps": 20000,
    # Final value of beta
    "final_prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # "schedule_max_timesteps": 20000,
    "timesteps_per_iteration": 1000,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    # "rollout_fragment_length": 1,
    "rollout_fragment_length": 1,  # Renamed from sample_batch_size in some Ray version
    "train_batch_size": 32,
    "min_iter_time_s": 0,
}


model_config = {
    "model": {
        # "fcnet_hiddens": [256, 256],
        # "custom_preprocessor": "ohe",
        "custom_options": {},  # extra options to pass to your preprocessor
        # "no_final_linear": False,
        # "vf_share_layers": True,
        # "fcnet_activation": "tanh",
        "use_lstm": False,
    },
}


eval_config = {
    "evaluation_interval": 1,  # I think this means every x training_iterations
    "evaluation_config": {
        "explore": False,
        "exploration_fraction": 0,
        "exploration_final_eps": 0,
        "evaluation_num_episodes": 10,
        "horizon": 100,
        "env_config": {
            "dummy_eval": True,  # hack Used to check if we are in evaluation mode or training mode inside Ray callback on_episode_end() to be able to write eval stats
            "transition_noise": 0
            if "state_space_type" in env_config["env_config"]
            and env_config["env_config"]["state_space_type"] == "discrete"
            else tune.function(lambda a: a.normal(0, 0)),
            "reward_noise": tune.function(lambda a: a.normal(0, 0)),
            "action_loss_weight": 0.0,
        },
    },
}
value_tuples = []
for config_type, config_dict in var_configs.items():
    for key in config_dict:
        assert isinstance(
            var_configs[config_type][key], list
        ), "var_config should be a dict of dicts with lists as the leaf values to allow each configuration option to take multiple possible values"
        value_tuples.append(var_configs[config_type][key])


cartesian_product_configs = list(itertools.product(*value_tuples))
print("Total number of configs. to run:", len(cartesian_product_configs))