From 2f0b400d0ac6111f131bda8954a8075d6d908b5c Mon Sep 17 00:00:00 2001
From: Wouter Caarls
Date: Fri, 25 Oct 2024 16:14:30 -0300
Subject: [PATCH] Added python reward environment.

Configuration strings starting with "@" will not be parsed.
---
Changes since the first revision (index lines still name the original blobs):
 - configure(): release the filename and code objects, stop releasing the
   borrowed __main__ dict, keep a strong reference to 'reward', and no
   longer call PyErr_Print() when no exception is pending.
 - step(): release the argument tuple before throwing and detect a
   non-numeric return value.
 - VectorToPyObject(): report allocation failures by returning NULL.
 - printPyList(): print floats as doubles and handle Python errors.

 .../python/include/grl/environments/python.h  |  30 ++++
 addons/python/src/python.cpp                  | 182 ++++++++++++++++++
 base/src/configurable.cpp                     |   5 +
 cfg/pendulum/ac_tc_python.yaml                |  99 ++++++++++
 cfg/pendulum/sarsa_tc_python.yaml             |  87 +++++++++
 5 files changed, 403 insertions(+)
 create mode 100644 cfg/pendulum/ac_tc_python.yaml
 create mode 100644 cfg/pendulum/sarsa_tc_python.yaml

diff --git a/addons/python/include/grl/environments/python.h b/addons/python/include/grl/environments/python.h
index 0cd2e425..8870beef 100644
--- a/addons/python/include/grl/environments/python.h
+++ b/addons/python/include/grl/environments/python.h
@@ -65,6 +65,36 @@ class GymEnvironment : public Environment
     void PyObjectToObservation(PyObject *obj, Observation *obs) const;
 };
 
+// Wraps a downstream environment and replaces the reward it reports with the
+// value of a Python function reward(action, obs) given in the configuration.
+class PythonRewardEnvironment : public Environment
+{
+  public:
+    TYPEINFO("environment/python", "Python reward environment")
+
+  protected:
+    Environment *env_;
+    std::string reward_function_str_;
+
+    // Callable 'reward' from the Python source; strong reference taken in configure().
+    PyObject *reward_function_;
+
+  public:
+    PythonRewardEnvironment() : env_(NULL), reward_function_(NULL) { }
+
+    // From Configurable
+    virtual void request(ConfigurationRequest *config);
+    virtual void configure(Configuration &config);
+    virtual void reconfigure(const Configuration &config);
+
+    // From Environment
+    virtual void start(int test, Observation *obs);
+    virtual double step(const Action &action, Observation *obs, double *reward, int *terminal);
+
+  protected:
+    PyObject *VectorToPyObject(const Vector &v) const;
+};
+
 }
 
 #endif // GRL_PYTHON_ENVIRONMENT_H_
diff --git a/addons/python/src/python.cpp b/addons/python/src/python.cpp
index e0cff7c6..4c596c0d 100644
--- a/addons/python/src/python.cpp
+++ b/addons/python/src/python.cpp
@@ -30,6 +30,7 @@
 using namespace grl;
 
 REGISTER_CONFIGURABLE(GymEnvironment)
+REGISTER_CONFIGURABLE(PythonRewardEnvironment)
 
 void GymEnvironment::request(ConfigurationRequest *config)
 {
@@ -183,3 +184,184 @@ void GymEnvironment::PyObjectToObservation(PyObject *obj, Observation *obs) cons
     Py_XDECREF(val);
   }
 }
+
+// PythonRewardEnvironment ***
+
+void PythonRewardEnvironment::request(ConfigurationRequest *config)
+{
+  config->push_back(CRP("reward", "Python string specifying reward function", reward_function_str_));
+  config->push_back(CRP("environment", "environment", "Downstream environment", env_));
+}
+
+// Debug helper: prints each int, float and str element of an iterable on its
+// own line. Returns false if arr is not iterable or if a Python error occurs.
+bool printPyList(PyObject* arr) {
+  PyObject *iter = PyObject_GetIter(arr);
+  if (iter == NULL) {
+    // PyObject_GetIter() raised TypeError; do not leave it pending.
+    PyErr_Clear();
+    printf("Object is not iterable.\n");
+    return false;
+  }
+
+  PyObject *item;
+  while ((item = PyIter_Next(iter)) != NULL) {
+    if (PyLong_Check(item)) {
+      printf("%ld\n", PyLong_AsLong(item));
+    } else if (PyFloat_Check(item)) {
+      // %f takes a double, so the value is passed on at full precision.
+      printf("%f\n", PyFloat_AsDouble(item));
+    } else if (PyUnicode_Check(item)) {
+      const char *text = PyUnicode_AsUTF8(item);
+      if (text != NULL)
+        printf("%s\n", text);
+    }
+    Py_DECREF(item);
+
+    // A failed conversion leaves an exception set; stop iterating then.
+    if (PyErr_Occurred())
+      break;
+  }
+  Py_DECREF(iter);
+
+  // PyIter_Next() also returns NULL when the iterator itself raised.
+  if (PyErr_Occurred()) {
+    PyErr_Clear();
+    return false;
+  }
+  return true;
+}
+
+// Compiles and runs the configured Python source in __main__ and stores the
+// callable 'reward' it defines. Throws Exception if any of these steps fails.
+void PythonRewardEnvironment::configure(Configuration &config)
+{
+  reward_function_str_ = config["reward"].str();
+  env_ = (Environment*)config["environment"].ptr();
+
+  Py_Initialize();
+
+  PyObject *filename = PyUnicode_DecodeFSDefault("");
+  if (!filename)
+  {
+    PyErr_Print();
+    throw Exception("Couldn't compile reward function");
+  }
+
+  PyObject *code = Py_CompileStringObject(reward_function_str_.c_str(), filename, Py_file_input, NULL, 0);
+  // Release filename on the failure path as well.
+  Py_DECREF(filename);
+  if (!code)
+  {
+    PyErr_Print();
+    ERROR(reward_function_str_);
+    throw Exception("Couldn't compile reward function");
+  }
+
+  // PyImport_AddModule() and PyModule_GetDict() return borrowed references,
+  // so main and globals must not be released.
+  PyObject *main = PyImport_AddModule("__main__");
+  if (!main)
+  {
+    PyErr_Print();
+    Py_DECREF(code);
+    throw Exception("Couldn't define reward function");
+  }
+  PyObject *globals = PyModule_GetDict(main);
+
+  PyObject *ret = PyEval_EvalCode(code, globals, globals);
+  Py_DECREF(code);
+  if (!ret)
+  {
+    PyErr_Print();
+    throw Exception("Couldn't define reward function");
+  }
+  Py_DECREF(ret);
+
+  // PyDict_GetItemString() returns a borrowed reference and sets no exception
+  // for a missing key, so PyErr_Print() must not be called here.
+  PyObject *func = PyDict_GetItemString(globals, "reward");
+  if (!func || !PyCallable_Check(func))
+    throw Exception("Reward function 'reward' not defined");
+
+  // Keep a strong reference so the callable outlives changes to __main__.
+  // It is deliberately never released: the class defines no destructor or
+  // copy control that a matching Py_DECREF() could be paired with.
+  Py_INCREF(func);
+  reward_function_ = func;
+}
+
+void PythonRewardEnvironment::reconfigure(const Configuration &config)
+{
+}
+
+// Starts the downstream environment; the reward function is not involved.
+void PythonRewardEnvironment::start(int test, Observation *obs)
+{
+  env_->start(test, obs);
+}
+
+// Steps the downstream environment, then overwrites *reward with the value of
+// the Python reward(action, obs). Returns what the downstream step() returned.
+double PythonRewardEnvironment::step(const Action &action, Observation *obs, double *reward, int *terminal)
+{
+  double tau = env_->step(action, obs, reward, terminal);
+
+  // PyTuple_Pack() takes its own references, so ours are always released.
+  PyObject *py_action = VectorToPyObject(action);
+  PyObject *py_obs = VectorToPyObject(*obs);
+  PyObject *args = (py_action && py_obs) ? PyTuple_Pack(2, py_action, py_obs) : NULL;
+  Py_XDECREF(py_action);
+  Py_XDECREF(py_obs);
+  if (!args)
+  {
+    PyErr_Print();
+    throw Exception("Couldn't evaluate reward function");
+  }
+
+  PyObject *ret = PyObject_CallObject(reward_function_, args);
+  // Release the arguments before a possible throw.
+  Py_DECREF(args);
+  if (!ret)
+  {
+    PyErr_Print();
+    throw Exception("Couldn't evaluate reward function");
+  }
+
+  double value = PyFloat_AsDouble(ret);
+  Py_DECREF(ret);
+
+  // -1.0 is a valid reward; only a pending exception marks a failed conversion.
+  if (value == -1.0 && PyErr_Occurred())
+  {
+    PyErr_Print();
+    throw Exception("Couldn't evaluate reward function");
+  }
+
+  *reward = value;
+  return tau;
+}
+
+// Converts v to a new Python tuple of floats. Returns a new reference, or
+// NULL with a Python exception set if an allocation fails.
+PyObject *PythonRewardEnvironment::VectorToPyObject(const Vector &v) const
+{
+  PyObject *obj = PyTuple_New(v.size());
+  if (!obj)
+    return NULL;
+
+  for (size_t ii=0; ii != v.size(); ++ii)
+  {
+    PyObject *val = PyFloat_FromDouble(v[ii]);
+    if (!val)
+    {
+      Py_DECREF(obj);
+      return NULL;
+    }
+
+    // PyTuple_SetItem() steals the reference to val.
+    PyTuple_SetItem(obj, ii, val);
+  }
+
+  return obj;
+}
diff --git a/base/src/configurable.cpp b/base/src/configurable.cpp
index 7f1cf483..53bdaad7 100644
--- a/base/src/configurable.cpp
+++ b/base/src/configurable.cpp
@@ -392,6 +392,11 @@ std::string ParameterConfigurator::str() const
 {
   std::string v = value_, id, expv;
 
+  // Values starting with @ are returned verbatim (minus that @), skipping
+  // reference resolution and evaluation. For a literal leading @, use @@.
+  if (!v.empty() && v[0] == '@')
+    return v.substr(1);
+
   // Resolve references
   for (size_t ii=0; ii < v.size(); ++ii)
   {
diff --git a/cfg/pendulum/ac_tc_python.yaml b/cfg/pendulum/ac_tc_python.yaml
new file mode 100644
index 00000000..59698eba
--- /dev/null
+++ b/cfg/pendulum/ac_tc_python.yaml
@@ -0,0 +1,99 @@
+reward:
+  |
+    @import math
+    def reward(action, obs):
+      a = obs[0] - math.pi
+      return -5*a**2 -0.1*obs[1]**2 -action[0]**2
+experiment:
+  type: experiment/online_learning
+  runs: 1
+  trials: 0
+  steps: 0
+  rate: 0
+  test_interval: 10
+  environment:
+    type: environment/python
+    reward: /reward
+    environment:
+      type: environment/modeled
+      model:
+        type: model/dynamical
+        control_step: 0.03
+        integration_steps: 5
+        dynamics:
+          type: dynamics/pendulum
+      task:
+        type: task/pendulum/swingup
+        timeout: 2.99
+  agent:
+    type: agent/td
+    policy:
+      type: mapping/policy/action
+      sigma: [1]
+      output_min: experiment/environment/environment/task/action_min
+      output_max: experiment/environment/environment/task/action_max
+      projector:
+        type: projector/tile_coding
+        tilings: 16
+        memory: 8388608
+        resolution: [0.31415, 3.1415]
+        wrapping: [6.283, 0]
+      representation:
+        type: representation/parameterized/linear
+        init_min: [0]
+        init_max: [1]
+        memory: experiment/agent/policy/projector/memory
+        outputs: experiment/environment/environment/task/action_dims
+        output_min: experiment/environment/environment/task/action_min
+        output_max: experiment/environment/environment/task/action_max
+    predictor:
+      type: predictor/ac/action
+      alpha: 0.01
+      critic:
+        type: predictor/critic/td
+        alpha: 0.2
+        gamma: 0.97
+        lambda: 0.65
+        projector: experiment/agent/policy/projector
+        representation:
+          type: representation/parameterized/linear
+          init_min: [0]
+          init_max: [1]
+          memory: experiment/agent/predictor/critic/projector/memory
+          outputs: 1
+          output_min: []
+          output_max: []
+        trace:
+          type: trace/enumerated/replacing
+      projector: experiment/agent/policy/projector
+      representation: experiment/agent/policy/representation
+  test_agent:
+    type: agent/fixed
+    policy:
+      type: mapping/policy/action
+      sigma: []
+      output_min: experiment/environment/environment/task/action_min
+      output_max: experiment/environment/environment/task/action_max
+      projector: experiment/agent/policy/projector
+      representation: experiment/agent/policy/representation
+visualizer:
+  type: visualizer/glut
+visualization:
+  type: visualization/field/mapping
+  field_dims: [ 0, 1 ]
+  input_min: experiment/environment/environment/task/observation_min
+  input_max: experiment/environment/environment/task/observation_max
+  points: 65536
+  projection: mean
+  mapping: experiment/test_agent/policy
+  output_dim: 0
+visualization2:
+  type: visualization/field/value
+  field_dims: [0, 1]
+  input_min: experiment/environment/environment/task/observation_min
+  input_max: experiment/environment/environment/task/observation_max
+  points: 65536
+  projection: mean
+  output_dim: 0
+  projector: experiment/agent/predictor/critic/projector
+  representation: experiment/agent/predictor/critic/representation
diff --git a/cfg/pendulum/sarsa_tc_python.yaml b/cfg/pendulum/sarsa_tc_python.yaml
new file mode 100644
index 00000000..162f3818
--- /dev/null
+++ b/cfg/pendulum/sarsa_tc_python.yaml
@@ -0,0 +1,87 @@
+reward:
+  |
+    @import math
+    def reward(action, obs):
+      a = obs[0] - math.pi
+      return -5*a**2 -0.1*obs[1]**2 -action[0]**2
+experiment:
+  type: experiment/online_learning
+  runs: 1
+  trials: 0
+  steps: 0
+  rate: 0
+  test_interval: 10
+  environment:
+    type: environment/python
+    reward: /reward
+    environment:
+      type: environment/modeled
+      model:
+        type: model/dynamical
+        control_step: 0.03
+        integration_steps: 5
+        dynamics:
+          type: dynamics/pendulum
+      task:
+        type: task/pendulum/swingup
+        timeout: 2.99
+        randomization: 0
+  agent:
+    type: agent/td
+    policy:
+      type: mapping/policy/discrete/value/q
+      discretizer:
+        type: discretizer/uniform
+        min: experiment/environment/environment/task/action_min
+        max: experiment/environment/environment/task/action_max
+        steps: [3]
+      projector:
+        type: projector/tile_coding
+        tilings: 16
+        memory: 8388608
+        safe: 0
+        resolution: [0.31415, 3.1415, 3]
+        wrapping: [6.283, 0, 0]
+      representation:
+        type: representation/parameterized/linear
+        init_min: [0]
+        init_max: [1]
+        memory: experiment/agent/policy/projector/memory
+        outputs: 1
+        output_min: []
+        output_max: []
+      sampler:
+        type: sampler/epsilon_greedy
+        rand_max: 0
+        epsilon: 0.05
+    predictor:
+      type: predictor/critic/sarsa
+      alpha: 0.2
+      gamma: 0.97
+      lambda: 0.65
+      projector: experiment/agent/policy/projector
+      representation: experiment/agent/policy/representation
+      trace:
+        type: trace/enumerated/replacing
+  test_agent:
+    type: agent/fixed
+    policy:
+      type: mapping/policy/discrete/value/q
+      discretizer: experiment/agent/policy/discretizer
+      projector: experiment/agent/policy/projector
+      representation: experiment/agent/policy/representation
+      sampler:
+        type: sampler/greedy
+        rand_max: 0
+  save_every: never
+visualizer:
+  type: visualizer/glut
+visualization:
+  type: visualization/field/policy/value
+  field_dims: [0, 1]
+  input_min: experiment/environment/environment/task/observation_min
+  input_max: experiment/environment/environment/task/observation_max
+  points: 65536
+  savepoints: 1048576
+  projection: mean
+  policy: experiment/test_agent/policy