LSSTDESC · eacharles · Sep 11, 2023 · Jul 3, 2023 · Jul 3, 2023 · Aug 2, 2023
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ photometric redshift data products, including uncertainties and summary
 statistics, and stress-test them under realistically complex systematics.
 A detailed description of RAIL's modular structure is available in the 
 [Overview](https://lsstdescrail.readthedocs.io/en/latest/source/overview.html) 
-on ReadTheDocs.
+on ReadTheDocs. 
 
 RAIL serves as the infrastructure supporting many extragalactic applications 
 of the Legacy Survey of Space and Time (LSST) on the Vera C. Rubin Observatory,

diff --git a/src/rail/estimation/algos/random_forest.py b/src/rail/estimation/algos/random_forest.py
@@ -0,0 +1,166 @@
+"""
+An example classifier that uses catalogue information to
+classify objects into tomoragphic bins using random forest.
+This is the base method in TXPipe, adapted from TXpipe/binning/random_forest.py
+Note: extra dependence on sklearn and input training file.
+"""
+
+from collections import OrderedDict
+import numpy as np
+from ceci.config import StageParameter as Param
+from rail.estimation.classifier import CatClassifier
+from rail.estimation.informer import CatInformer
+from rail.core.data import TableHandle, ModelHandle, Hdf5Handle
+from sklearn.ensemble import RandomForestClassifier as skl_RandomForestClassifier
+
+from rail.core.common_params import SHARED_PARAMS
+
+
+class randomForestmodel:
+    """
+    Temporary class to store the trained model.
+    """
+    def __init__(self, skl_classifier, features):
+        self.skl_classifier = skl_classifier
+        self.features = features
+
+
+class RandomForestInformer(CatInformer):
+    """Train the random forest classifier"""
+
+    name = 'Inform_randomForestClassifier'
+    config_options = CatInformer.config_options.copy()
+    config_options.update(
+        class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"),
+        bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"),
+        redshift_col=Param(str, "sz", msg="Redshift column names"),
+        bin_edges=Param(tuple, [0,0.5,1.0], msg="Binning for training data"),
+        random_seed=Param(int, msg="random seed"),
+        no_assign=Param(int, -99, msg="Value for no assignment flag"),)
+    outputs = [('model', ModelHandle)]
+
+    def __init__(self, args, comm=None):
+        CatInformer.__init__(self, args, comm=comm)
+
+    def run(self):
+        # Load the training data
+        if self.config.hdf5_groupname:
+            training_data_table = self.get_data('input')[self.config.hdf5_groupname]
+        else:  # pragma: no cover
+            training_data_table = self.get_data('input')
+
+        # Pull out the appropriate columns and combinations of the data
+        print(f"Using these bands to train the tomography selector: {self.config.bands}")
+
+        # Generate the training data that we will use
+        # We record both the name of the column and the data itself
+        features = []
+        training_data = []
+        for b1 in self.config.class_bands[:]:
+            b1_cat=self.config.bands[b1]
+            # First we use the magnitudes themselves
+            features.append(b1)
+            training_data.append(training_data_table[b1_cat])
+            # We also use the colours as training data, even the redundant ones
+            for b2 in self.config.class_bands[:]:
+                b2_cat=self.config.bands[b2]
+                if b1 < b2:
+                    features.append(f"{b1}-{b2}")
+                    training_data.append(training_data_table[b1_cat] - training_data_table[b2_cat])
+        training_data = np.array(training_data).T
+
+        print("Training data for bin classifier has shape ", training_data.shape)
+
+        # Now put the training data into redshift bins
+        # We use -99 to indicate that we are outside the desired ranges
+        z = training_data_table[self.config.redshift_col]
+        training_bin = np.repeat(self.config.no_assign, len(z))
+        print("Using these bin edges:", self.config.bin_edges)
+        for i, zmin in enumerate(self.config.bin_edges[:-1]):
+            zmax = self.config.bin_edges[i + 1]
+            training_bin[(z > zmin) & (z < zmax)] = i
+            ntrain_bin = ((z > zmin) & (z < zmax)).sum()
+            print(f"Training set: {ntrain_bin} objects in tomographic bin {i}")
+
+        # Can be replaced with any classifier
+        skl_classifier = skl_RandomForestClassifier(
+            max_depth=10,
+            max_features=None,
+            n_estimators=20,
+            random_state=self.config.random_seed,
+        )
+        skl_classifier.fit(training_data, training_bin)
+
+        #return classifier, features
+        self.model = randomForestmodel(skl_classifier, features)
+        self.add_data('model', self.model)
+
+
+class RandomForestClassifier(CatClassifier):
+    """Classifier that assigns tomographic 
+    bins based on random forest method"""
+
+    name = 'randomForestClassifier'
+    config_options = CatClassifier.config_options.copy()
+    config_options.update(
+        id_name=Param(str, "", msg="Column name for the object ID in the input data, if empty the row index is used as the ID."),
+        class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"),
+        bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"),)
+    outputs = [('output', Hdf5Handle)]
+
+    def __init__(self, args, comm=None):
+        CatClassifier.__init__(self, args, comm=comm)
+
+
+    def open_model(self, **kwargs):
+        CatClassifier.open_model(self, **kwargs)
+        if self.model is None:  # pragma: no cover
+            return
+        self.skl_classifier = self.model.skl_classifier
+        self.features = self.model.features
+
+
+    def run(self):
+        """Apply the classifier to the measured magnitudes"""
+
+        if self.config.hdf5_groupname:
+            test_data = self.get_data('input')[self.config.hdf5_groupname]
+        else:  # pragma: no cover
+            test_data = self.get_data('input')
+
+        data = []
+        for f in self.features:
+            # may be a single band
+            if len(f) == 1:
+                f_cat=self.config.bands[f]
+                col = test_data[f_cat]
+            # or a colour
+            else:
+                b1, b2 = f.split("-")
+                b1_cat=self.config.bands[b1]
+                b2_cat=self.config.bands[b2]
+                col = (test_data[b1_cat] - test_data[b2_cat])
+            if np.all(~np.isfinite(col)):
+                # entire column is NaN.  Hopefully this will get deselected elsewhere
+                col[:] = 30.0
+            else:
+                ok = np.isfinite(col)
+                col[~ok] = col[ok].max()
+            data.append(col)
+        data = np.array(data).T
+
+        # Run the random forest on this data chunk
+        bin_index = self.skl_classifier.predict(data)
+
+        if self.config.id_name != "":
+            # below is commented out and replaced by a redundant line
+            # because the data doesn't have ID yet
+            obj_id = test_data[self.config.id_name]
+        elif self.config.id_name == "":
+            # ID set to row index
+            b=self.config.bands[self.config.class_bands[0]]
+            obj_id = np.arange(len(test_data[b]))
+            self.config.id_name="row_index"
+
+        class_id = dict(data=OrderedDict([(self.config.id_name, obj_id), ("class_id", bin_index)]))
+        self.add_data('output', class_id)
diff --git a/src/rail/sklearn/__init__.py b/src/rail/sklearn/__init__.py
@@ -3,3 +3,4 @@
 from rail.estimation.algos.k_nearneigh import *
 from rail.estimation.algos.sklearn_neurnet import *
 from rail.estimation.algos.nz_dir import *
+from rail.estimation.algos.random_forest import *
diff --git a/tests/sklearn/test_algos.py b/tests/sklearn/test_algos.py
@@ -7,7 +7,7 @@
 from rail.core.stage import RailStage
 from rail.core.utils import RAILDIR
 from rail.core.data import TableHandle
-from rail.estimation.algos import k_nearneigh, sklearn_neurnet
+from rail.estimation.algos import k_nearneigh, sklearn_neurnet, random_forest
 
 sci_ver_str = scipy.__version__.split(".")
 
@@ -83,7 +83,6 @@ def test_KNearNeigh():
     # assert np.isclose(results.ancil['zmode'], zb_expected).all()
     assert np.isclose(results.ancil["zmode"], rerun_results.ancil["zmode"]).all()
 
-
 # test for k=1 when data point has same value, used to cause errors because of
 # a divide by zero, should be fixed now but add a test
 def test_same_data_knn():
@@ -104,10 +103,82 @@ def test_same_data_knn():
     assert ~(np.isnan(modes).all())
     os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))
 
-
+    
 def test_catch_bad_bands():
     params = dict(bands="u,g,r,i,z,y")
     with pytest.raises(ValueError):
         sklearn_neurnet.SklNeurNetInformer.make_stage(hdf5_groupname="", **params)
     with pytest.raises(ValueError):
         sklearn_neurnet.SklNeurNetEstimator.make_stage(hdf5_groupname="", **params)
+
+
+def test_randomForestClassifier():
+    class_bands = [ "r","i","z"]
+    bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
+    bin_edges=[0,0.2,0.5]
+
+    train_config_dict=dict(
+        class_bands=class_bands,
+        bands=bands,
+        redshift_col="redshift",
+        bin_edges=bin_edges,
+        random_seed=10,
+        hdf5_groupname="photometry",
+        model="model.tmp",
+    )
+
+    estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="")
+
+    train_algo = random_forest.RandomForestInformer
+    tomo_algo = random_forest.RandomForestClassifier
+    results, rerun_results, rerun3_results = one_algo(
+        "randomForestClassifier", train_algo, tomo_algo, train_config_dict, estim_config_dict,
+        is_classifier=True,
+    )
+    assert np.isclose(results["data"]["class_id"], rerun_results["data"]["class_id"]).all()
+    assert len(results["data"]["class_id"])==len(results["data"]["row_index"])
+
+
+def test_randomForestClassifier_id():
+    class_bands = [ "r","i","z"]
+    bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
+    bin_edges=[0,0.2,0.5]
+
+    train_config_dict=dict(
+        class_bands=class_bands,
+        bands=bands,
+        redshift_col="redshift",
+        bin_edges=bin_edges,
+        random_seed=10,
+        hdf5_groupname="photometry",
+        model="model.tmp",
+    )
+    estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="id")
+
+    train_algo = random_forest.RandomForestInformer
+    tomo_algo = random_forest.RandomForestClassifier
+
+    traindata = os.path.join(RAILDIR, 'rail/examples_data/testdata/training_100gal.hdf5')
+    validdata = os.path.join(RAILDIR, 'rail/examples_data/testdata/validation_10gal.hdf5')
+
+    DS = RailStage.data_store
+    DS.__class__.allow_overwrite = True
+    DS.clear()
+    training_data = DS.read_file('training_data', TableHandle, traindata)
+    validation_data = DS.read_file('validation_data', TableHandle, validdata)
+
+    train_pz = train_algo.make_stage(**train_config_dict)
+    train_pz.inform(training_data)
+    pz = tomo_algo.make_stage(name="randomForestClassifier", **estim_config_dict)
+    estim = pz.classify(training_data)
+    results=estim.data
+
+    os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))
+    model_file = estim_config_dict.get('model', 'None')
+    if model_file != 'None':
+        try:
+            os.remove(model_file)
+        except FileNotFoundError:  #pragma: no cover
+            pass
+
+    assert len(results["data"]["class_id"])==len(results["data"]["id"])