Added python reward environment.
Configuration strings starting with "@" will not be parsed.
wcaarls committed Oct 25, 2024
1 parent 2f3afc5 commit 2f0b400
Showing 5 changed files with 332 additions and 0 deletions.
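
In short: the new environment/python type wraps a downstream environment and replaces its reward with a function defined in a configuration string, and the parser change keeps that string from being evaluated as a configuration expression when it is prefixed with "@". Excerpted from the configuration files added below, the wrapper is used as:

environment:
  type: environment/python
  reward: /reward
  environment:
    type: environment/modeled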
27 changes: 27 additions & 0 deletions addons/python/include/grl/environments/python.h
@@ -65,6 +65,33 @@ class GymEnvironment : public Environment
void PyObjectToObservation(PyObject *obj, Observation *obs) const;
};

class PythonRewardEnvironment : public Environment
{
  public:
    TYPEINFO("environment/python", "Python reward environment")

  protected:
    Environment *env_;
    std::string reward_function_str_;

    PyObject *reward_function_;

  public:
    PythonRewardEnvironment() : env_(NULL), reward_function_(NULL) { }

    // From Configurable
    virtual void request(ConfigurationRequest *config);
    virtual void configure(Configuration &config);
    virtual void reconfigure(const Configuration &config);

    // From Environment
    virtual void start(int test, Observation *obs);
    virtual double step(const Action &action, Observation *obs, double *reward, int *terminal);

  protected:
    PyObject *VectorToPyObject(const Vector &v) const;
};

}

#endif // GRL_PYTHON_ENVIRONMENT_H_
114 changes: 114 additions & 0 deletions addons/python/src/python.cpp
@@ -30,6 +30,7 @@
using namespace grl;

REGISTER_CONFIGURABLE(GymEnvironment)
REGISTER_CONFIGURABLE(PythonRewardEnvironment)

void GymEnvironment::request(ConfigurationRequest *config)
{
@@ -183,3 +184,116 @@ void GymEnvironment::PyObjectToObservation(PyObject *obj, Observation *obs) const
Py_XDECREF(val);
}
}

// PythonRewardEnvironment ***

void PythonRewardEnvironment::request(ConfigurationRequest *config)
{
  config->push_back(CRP("reward", "Python string specifying reward function", reward_function_str_));
  config->push_back(CRP("environment", "environment", "Downstream environment", env_));
}

bool printPyList(PyObject* arr) {
  PyObject *iter;
  PyObject *item;
  if ((iter = PyObject_GetIter(arr)) == NULL) {
    printf("List is Empty.\n");
    return false;
  }
  while ((item = PyIter_Next(iter)) != NULL) {
    if (PyLong_Check(item)) {
      long long_item = PyLong_AsLong(item);
      printf("%ld\n", long_item);
    }
    if (PyFloat_Check(item)) {
      float float_item = PyFloat_AsDouble(item);
      printf("%f\n", float_item);
    }
    if (PyUnicode_Check(item)) {
      const char *unicode_item = PyUnicode_AsUTF8(item);
      printf("%s\n", unicode_item);
    }
    Py_DECREF(item);
  }
  Py_DECREF(iter);
  return true;
}

void PythonRewardEnvironment::configure(Configuration &config)
{
  reward_function_str_ = config["reward"].str();
  env_ = (Environment*)config["environment"].ptr();

  Py_Initialize();

  PyObject *filename = PyUnicode_DecodeFSDefault("<internal>");
  PyObject *code = Py_CompileStringObject(reward_function_str_.c_str(), filename, Py_file_input, NULL, 0);
  if (!code)
  {
    PyErr_Print();
    ERROR(reward_function_str_);
    throw Exception("Couldn't compile reward function");
  }
  Py_DECREF(filename);

  PyObject* main = PyImport_AddModule("__main__");
  PyObject* globals = PyModule_GetDict(main);
  PyObject *ret = PyEval_EvalCode(code, globals, globals);
  if (!ret)
  {
    PyErr_Print();
    throw Exception("Couldn't define reward function");
  }
  Py_DECREF(ret);
  Py_DECREF(code);

  reward_function_ = PyDict_GetItemString(globals, "reward");
  if (!reward_function_)
  {
    PyErr_Print();
    throw Exception("Reward function 'reward' not defined");
  }

  // PyDict_GetItemString returns a borrowed reference; keep the function alive
  // for later calls in step(). Note that globals is also borrowed (from
  // PyModule_GetDict) and must not be decref'd.
  Py_INCREF(reward_function_);
}

void PythonRewardEnvironment::reconfigure(const Configuration &config)
{
}

void PythonRewardEnvironment::start(int test, Observation *obs)
{
  env_->start(test, obs);
}

double PythonRewardEnvironment::step(const Action &action, Observation *obs, double *reward, int *terminal)
{
  double tau = env_->step(action, obs, reward, terminal);

  // Evaluate the Python reward function on the action and resulting observation
  PyObject *args = PyTuple_New(2);
  PyTuple_SetItem(args, 0, VectorToPyObject(action));
  PyTuple_SetItem(args, 1, VectorToPyObject(*obs));

  PyObject *ret = PyObject_CallObject(reward_function_, args);
  if (!ret)
  {
    PyErr_Print();
    Py_DECREF(args);
    throw Exception("Couldn't evaluate reward function");
  }

  Py_DECREF(args);
  *reward = PyFloat_AsDouble(ret);
  Py_DECREF(ret);

  return tau;
}

PyObject *PythonRewardEnvironment::VectorToPyObject(const Vector &v) const
{
  PyObject *obj = PyTuple_New(v.size());
  for (size_t ii=0; ii != v.size(); ++ii)
    PyTuple_SetItem(obj, ii, PyFloat_FromDouble(v[ii]));

  return obj;
}
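
Taken together, configure() compiles the configured string in the __main__ module and looks up a function named reward, and step() calls that function after every step of the wrapped environment, passing the action and the new observation as tuples of floats; the returned value overwrites the downstream reward, while the observation, terminal flag and time step are passed through unchanged. Any function with that signature can be substituted. As a sketch (not part of this commit), a sparse alternative to the swing-up reward used in the configurations below, assuming as there that obs[0] is the pendulum angle with the upright at π:

reward:
  |
    @import math
    def reward(action, obs):
      # 1 inside a 0.1 rad band around the upright position, 0 elsewhere
      return 1.0 if abs(obs[0] - math.pi) < 0.1 else 0.0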
5 changes: 5 additions & 0 deletions base/src/configurable.cpp
@@ -392,6 +392,11 @@ std::string ParameterConfigurator::str() const
{
  std::string v = value_, id, expv;

  // Expressions starting with @ are not evaluated. For literal
  // @ at the beginning of an expression, use @@.
  if (!value_.empty() && v[0] == '@')
    return value_.substr(1);

  // Resolve references
  for (size_t ii=0; ii < v.size(); ++ii)
  {
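
The early return above means that any parameter string beginning with "@" is handed to its consumer verbatim, minus the leading "@", instead of being scanned for references and expressions; a parameter that should start with a literal "@" can be written with "@@". A small illustrative sketch (the key names are hypothetical, not part of the commit):

code: "@import math"   # the consumer receives "import math", unevaluated
name: "@@anchor"       # yields the literal string "@anchor"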
99 changes: 99 additions & 0 deletions cfg/pendulum/ac_tc_python.yaml
@@ -0,0 +1,99 @@
reward:
  |
    @import math
    def reward(action, obs):
      a = obs[0] - math.pi
      return -5*a**2 -0.1*obs[1]**2 -action[0]**2
experiment:
  type: experiment/online_learning
  runs: 1
  trials: 0
  steps: 0
  rate: 0
  test_interval: 10
  environment:
    type: environment/python
    reward: /reward
    environment:
      type: environment/modeled
      model:
        type: model/dynamical
        control_step: 0.03
        integration_steps: 5
        dynamics:
          type: dynamics/pendulum
      task:
        type: task/pendulum/swingup
        timeout: 2.99
  agent:
    type: agent/td
    policy:
      type: mapping/policy/action
      sigma: [1]
      output_min: experiment/environment/environment/task/action_min
      output_max: experiment/environment/environment/task/action_max
      projector:
        type: projector/tile_coding
        tilings: 16
        memory: 8388608
        resolution: [0.31415, 3.1415]
        wrapping: [6.283, 0]
      representation:
        type: representation/parameterized/linear
        init_min: [0]
        init_max: [1]
        memory: experiment/agent/policy/projector/memory
        outputs: experiment/environment/environment/task/action_dims
        output_min: experiment/environment/environment/task/action_min
        output_max: experiment/environment/environment/task/action_max
    predictor:
      type: predictor/ac/action
      alpha: 0.01
      critic:
        type: predictor/critic/td
        alpha: 0.2
        gamma: 0.97
        lambda: 0.65
        projector: experiment/agent/policy/projector
        representation:
          type: representation/parameterized/linear
          init_min: [0]
          init_max: [1]
          memory: experiment/agent/predictor/critic/projector/memory
          outputs: 1
          output_min: []
          output_max: []
        trace:
          type: trace/enumerated/replacing
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
  test_agent:
    type: agent/fixed
    policy:
      type: mapping/policy/action
      sigma: []
      output_min: experiment/environment/environment/task/action_min
      output_max: experiment/environment/environment/task/action_max
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
visualizer:
  type: visualizer/glut
visualization:
  type: visualization/field/mapping
  field_dims: [ 0, 1 ]
  input_min: experiment/environment/environment/task/observation_min
  input_max: experiment/environment/environment/task/observation_max
  points: 65536
  projection: mean
  mapping: experiment/test_agent/policy
  output_dim: 0
visualization2:
  type: visualization/field/value
  field_dims: [0, 1]
  input_min: experiment/environment/environment/task/observation_min
  input_max: experiment/environment/environment/task/observation_max
  points: 65536
  projection: mean
  output_dim: 0
  projector: experiment/agent/predictor/critic/projector
  representation: experiment/agent/predictor/critic/representation
87 changes: 87 additions & 0 deletions cfg/pendulum/sarsa_tc_python.yaml
@@ -0,0 +1,87 @@
reward:
  |
    @import math
    def reward(action, obs):
      a = obs[0] - math.pi
      return -5*a**2 -0.1*obs[1]**2 -action[0]**2
experiment:
  type: experiment/online_learning
  runs: 1
  trials: 0
  steps: 0
  rate: 0
  test_interval: 10
  environment:
    type: environment/python
    reward: /reward
    environment:
      type: environment/modeled
      model:
        type: model/dynamical
        control_step: 0.03
        integration_steps: 5
        dynamics:
          type: dynamics/pendulum
      task:
        type: task/pendulum/swingup
        timeout: 2.99
        randomization: 0
  agent:
    type: agent/td
    policy:
      type: mapping/policy/discrete/value/q
      discretizer:
        type: discretizer/uniform
        min: experiment/environment/environment/task/action_min
        max: experiment/environment/environment/task/action_max
        steps: [3]
      projector:
        type: projector/tile_coding
        tilings: 16
        memory: 8388608
        safe: 0
        resolution: [0.31415, 3.1415, 3]
        wrapping: [6.283, 0, 0]
      representation:
        type: representation/parameterized/linear
        init_min: [0]
        init_max: [1]
        memory: experiment/agent/policy/projector/memory
        outputs: 1
        output_min: []
        output_max: []
      sampler:
        type: sampler/epsilon_greedy
        rand_max: 0
        epsilon: 0.05
    predictor:
      type: predictor/critic/sarsa
      alpha: 0.2
      gamma: 0.97
      lambda: 0.65
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
      trace:
        type: trace/enumerated/replacing
  test_agent:
    type: agent/fixed
    policy:
      type: mapping/policy/discrete/value/q
      discretizer: experiment/agent/policy/discretizer
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
      sampler:
        type: sampler/greedy
        rand_max: 0
  save_every: never
visualizer:
  type: visualizer/glut
visualization:
  type: visualization/field/policy/value
  field_dims: [0, 1]
  input_min: experiment/environment/environment/task/observation_min
  input_max: experiment/environment/environment/task/observation_max
  points: 65536
  savepoints: 1048576
  projection: mean
  policy: experiment/test_agent/policy
