Added python reward environment.
Configuration strings starting with "@" will not be parsed.
wcaarls committed Oct 25, 2024
1 parent 2f3afc5 commit 2f0b400
Showing 5 changed files with 332 additions and 0 deletions.
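
In short: the new environment/python type wraps a downstream environment and replaces its reward with a function defined in a configuration string, and the parser change keeps that string from being evaluated as a configuration expression when it is prefixed with "@". Excerpted from the configuration files added below, the wrapper is used as:

environment:
  type: environment/python
  reward: /reward
  environment:
    type: environment/modeled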
27 changes: 27 additions & 0 deletions addons/python/include/grl/environments/python.h
@@ -65,6 +65,33 @@ class GymEnvironment : public Environment
void PyObjectToObservation(PyObject *obj, Observation *obs) const;
};

class PythonRewardEnvironment : public Environment
{
  public:
    TYPEINFO("environment/python", "Python reward environment")

  protected:
    Environment *env_;
    std::string reward_function_str_;

    PyObject *reward_function_;

  public:
    PythonRewardEnvironment() : env_(NULL), reward_function_(NULL) { }

    // From Configurable
    virtual void request(ConfigurationRequest *config);
    virtual void configure(Configuration &config);
    virtual void reconfigure(const Configuration &config);

    // From Environment
    virtual void start(int test, Observation *obs);
    virtual double step(const Action &action, Observation *obs, double *reward, int *terminal);

  protected:
    PyObject *VectorToPyObject(const Vector &v) const;
};

}

#endif // GRL_PYTHON_ENVIRONMENT_H_
114 changes: 114 additions & 0 deletions addons/python/src/python.cpp
@@ -30,6 +30,7 @@
using namespace grl;

REGISTER_CONFIGURABLE(GymEnvironment)
REGISTER_CONFIGURABLE(PythonRewardEnvironment)

void GymEnvironment::request(ConfigurationRequest *config)
{
@@ -183,3 +184,116 @@ void GymEnvironment::PyObjectToObservation(PyObject *obj, Observation *obs) const
Py_XDECREF(val);
}
}

// PythonRewardEnvironment ***

void PythonRewardEnvironment::request(ConfigurationRequest *config)
{
  config->push_back(CRP("reward", "Python string specifying reward function", reward_function_str_));
  config->push_back(CRP("environment", "environment", "Downstream environment", env_));
}

bool printPyList(PyObject* arr) {
  PyObject *iter;
  PyObject *item;
  if ((iter = PyObject_GetIter(arr)) == NULL) {
    printf("List is Empty.\n");
    return false;
  }
  while ((item = PyIter_Next(iter)) != NULL) {
    if (PyLong_Check(item)) {
      long long_item = PyLong_AsLong(item);
      printf("%ld\n", long_item);
    }
    if (PyFloat_Check(item)) {
      float float_item = PyFloat_AsDouble(item);
      printf("%f\n", float_item);
    }
    if (PyUnicode_Check(item)) {
      const char *unicode_item = PyUnicode_AsUTF8(item);
      printf("%s\n", unicode_item);
    }
    Py_DECREF(item);
  }
  Py_DECREF(iter);
  return true;
}

void PythonRewardEnvironment::configure(Configuration &config)
{
  reward_function_str_ = config["reward"].str();
  env_ = (Environment*)config["environment"].ptr();

  Py_Initialize();

  PyObject *filename = PyUnicode_DecodeFSDefault("<internal>");
  PyObject *code = Py_CompileStringObject(reward_function_str_.c_str(), filename, Py_file_input, NULL, 0);
  if (!code)
  {
    PyErr_Print();
    ERROR(reward_function_str_);
    throw Exception("Couldn't compile reward function");
  }
  Py_DECREF(filename);

  PyObject* main = PyImport_AddModule("__main__");
  PyObject* globals = PyModule_GetDict(main);
  PyObject *ret = PyEval_EvalCode(code, globals, globals);
  if (!ret)
  {
    PyErr_Print();
    throw Exception("Couldn't define reward function");
  }
  Py_DECREF(ret);
  Py_DECREF(code);

  reward_function_ = PyDict_GetItemString(globals, "reward");
  if (!reward_function_)
  {
    PyErr_Print();
    throw Exception("Reward function 'reward' not defined");
  }

  // PyDict_GetItemString returns a borrowed reference; keep the function alive
  // for later calls in step(). Note that globals is also borrowed (from
  // PyModule_GetDict) and must not be decref'd.
  Py_INCREF(reward_function_);
}

void PythonRewardEnvironment::reconfigure(const Configuration &config)
{
}

void PythonRewardEnvironment::start(int test, Observation *obs)
{
  env_->start(test, obs);
}

double PythonRewardEnvironment::step(const Action &action, Observation *obs, double *reward, int *terminal)
{
  double tau = env_->step(action, obs, reward, terminal);

  // Evaluate the Python reward function on the action and resulting observation
  PyObject *args = PyTuple_New(2);
  PyTuple_SetItem(args, 0, VectorToPyObject(action));
  PyTuple_SetItem(args, 1, VectorToPyObject(*obs));

  PyObject *ret = PyObject_CallObject(reward_function_, args);
  if (!ret)
  {
    PyErr_Print();
    Py_DECREF(args);
    throw Exception("Couldn't evaluate reward function");
  }

  Py_DECREF(args);
  *reward = PyFloat_AsDouble(ret);
  Py_DECREF(ret);

  return tau;
}

PyObject *PythonRewardEnvironment::VectorToPyObject(const Vector &v) const
{
  PyObject *obj = PyTuple_New(v.size());
  for (size_t ii=0; ii != v.size(); ++ii)
    PyTuple_SetItem(obj, ii, PyFloat_FromDouble(v[ii]));

  return obj;
}
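
Taken together, configure() compiles the configured string in the __main__ module and looks up a function named reward, and step() calls that function after every step of the wrapped environment, passing the action and the new observation as tuples of floats; the returned value overwrites the downstream reward, while the observation, terminal flag and time step are passed through unchanged. Any function with that signature can be substituted. As a sketch (not part of this commit), a sparse alternative to the swing-up reward used in the configurations below, assuming as there that obs[0] is the pendulum angle with the upright at π:

reward:
  |
    @import math
    def reward(action, obs):
      # 1 inside a 0.1 rad band around the upright position, 0 elsewhere
      return 1.0 if abs(obs[0] - math.pi) < 0.1 else 0.0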
5 changes: 5 additions & 0 deletions base/src/configurable.cpp
@@ -392,6 +392,11 @@ std::string ParameterConfigurator::str() const
{
  std::string v = value_, id, expv;

  // Expressions starting with @ are not evaluated. For literal
  // @ at the beginning of an expression, use @@.
  if (!value_.empty() && v[0] == '@')
    return value_.substr(1);

  // Resolve references
  for (size_t ii=0; ii < v.size(); ++ii)
  {
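
The early return above means that any parameter string beginning with "@" is handed to its consumer verbatim, minus the leading "@", instead of being scanned for references and expressions; a parameter that should start with a literal "@" can be written with "@@". A small illustrative sketch (the key names are hypothetical, not part of the commit):

code: "@import math"   # the consumer receives "import math", unevaluated
name: "@@anchor"       # yields the literal string "@anchor"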
99 changes: 99 additions & 0 deletions cfg/pendulum/ac_tc_python.yaml
@@ -0,0 +1,99 @@
reward:
  |
    @import math
    def reward(action, obs):
      a = obs[0] - math.pi
      return -5*a**2 -0.1*obs[1]**2 -action[0]**2
experiment:
  type: experiment/online_learning
  runs: 1
  trials: 0
  steps: 0
  rate: 0
  test_interval: 10
  environment:
    type: environment/python
    reward: /reward
    environment:
      type: environment/modeled
      model:
        type: model/dynamical
        control_step: 0.03
        integration_steps: 5
        dynamics:
          type: dynamics/pendulum
      task:
        type: task/pendulum/swingup
        timeout: 2.99
  agent:
    type: agent/td
    policy:
      type: mapping/policy/action
      sigma: [1]
      output_min: experiment/environment/environment/task/action_min
      output_max: experiment/environment/environment/task/action_max
      projector:
        type: projector/tile_coding
        tilings: 16
        memory: 8388608
        resolution: [0.31415, 3.1415]
        wrapping: [6.283, 0]
      representation:
        type: representation/parameterized/linear
        init_min: [0]
        init_max: [1]
        memory: experiment/agent/policy/projector/memory
        outputs: experiment/environment/environment/task/action_dims
        output_min: experiment/environment/environment/task/action_min
        output_max: experiment/environment/environment/task/action_max
    predictor:
      type: predictor/ac/action
      alpha: 0.01
      critic:
        type: predictor/critic/td
        alpha: 0.2
        gamma: 0.97
        lambda: 0.65
        projector: experiment/agent/policy/projector
        representation:
          type: representation/parameterized/linear
          init_min: [0]
          init_max: [1]
          memory: experiment/agent/predictor/critic/projector/memory
          outputs: 1
          output_min: []
          output_max: []
        trace:
          type: trace/enumerated/replacing
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
  test_agent:
    type: agent/fixed
    policy:
      type: mapping/policy/action
      sigma: []
      output_min: experiment/environment/environment/task/action_min
      output_max: experiment/environment/environment/task/action_max
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
visualizer:
  type: visualizer/glut
visualization:
  type: visualization/field/mapping
  field_dims: [ 0, 1 ]
  input_min: experiment/environment/environment/task/observation_min
  input_max: experiment/environment/environment/task/observation_max
  points: 65536
  projection: mean
  mapping: experiment/test_agent/policy
  output_dim: 0
visualization2:
  type: visualization/field/value
  field_dims: [0, 1]
  input_min: experiment/environment/environment/task/observation_min
  input_max: experiment/environment/environment/task/observation_max
  points: 65536
  projection: mean
  output_dim: 0
  projector: experiment/agent/predictor/critic/projector
  representation: experiment/agent/predictor/critic/representation
87 changes: 87 additions & 0 deletions cfg/pendulum/sarsa_tc_python.yaml
@@ -0,0 +1,87 @@
reward:
  |
    @import math
    def reward(action, obs):
      a = obs[0] - math.pi
      return -5*a**2 -0.1*obs[1]**2 -action[0]**2
experiment:
  type: experiment/online_learning
  runs: 1
  trials: 0
  steps: 0
  rate: 0
  test_interval: 10
  environment:
    type: environment/python
    reward: /reward
    environment:
      type: environment/modeled
      model:
        type: model/dynamical
        control_step: 0.03
        integration_steps: 5
        dynamics:
          type: dynamics/pendulum
      task:
        type: task/pendulum/swingup
        timeout: 2.99
        randomization: 0
  agent:
    type: agent/td
    policy:
      type: mapping/policy/discrete/value/q
      discretizer:
        type: discretizer/uniform
        min: experiment/environment/environment/task/action_min
        max: experiment/environment/environment/task/action_max
        steps: [3]
      projector:
        type: projector/tile_coding
        tilings: 16
        memory: 8388608
        safe: 0
        resolution: [0.31415, 3.1415, 3]
        wrapping: [6.283, 0, 0]
      representation:
        type: representation/parameterized/linear
        init_min: [0]
        init_max: [1]
        memory: experiment/agent/policy/projector/memory
        outputs: 1
        output_min: []
        output_max: []
      sampler:
        type: sampler/epsilon_greedy
        rand_max: 0
        epsilon: 0.05
    predictor:
      type: predictor/critic/sarsa
      alpha: 0.2
      gamma: 0.97
      lambda: 0.65
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
      trace:
        type: trace/enumerated/replacing
  test_agent:
    type: agent/fixed
    policy:
      type: mapping/policy/discrete/value/q
      discretizer: experiment/agent/policy/discretizer
      projector: experiment/agent/policy/projector
      representation: experiment/agent/policy/representation
      sampler:
        type: sampler/greedy
        rand_max: 0
  save_every: never
visualizer:
  type: visualizer/glut
visualization:
  type: visualization/field/policy/value
  field_dims: [0, 1]
  input_min: experiment/environment/environment/task/observation_min
  input_max: experiment/environment/environment/task/observation_max
  points: 65536
  savepoints: 1048576
  projection: mean
  policy: experiment/test_agent/policy
