Skip to content

Commit

Permalink
Merge pull request #1 from LSSTDESC/issue/10/tomo_bins
Browse files Browse the repository at this point in the history
Add random forest classifier for tomographer
  • Loading branch information
eacharles authored Sep 11, 2023
2 parents 8d642bd + 61f004d commit 10ba2ab
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 4 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ photometric redshift data products, including uncertainties and summary
statistics, and stress-test them under realistically complex systematics.
A detailed description of RAIL's modular structure is available in the
[Overview](https://lsstdescrail.readthedocs.io/en/latest/source/overview.html)
on ReadTheDocs.
on ReadTheDocs.

RAIL serves as the infrastructure supporting many extragalactic applications
of the Legacy Survey of Space and Time (LSST) on the Vera C. Rubin Observatory,
Expand Down
166 changes: 166 additions & 0 deletions src/rail/estimation/algos/random_forest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""
An example classifier that uses catalogue information to
classify objects into tomoragphic bins using random forest.
This is the base method in TXPipe, adapted from TXpipe/binning/random_forest.py
Note: extra dependence on sklearn and input training file.
"""

from collections import OrderedDict
import numpy as np
from ceci.config import StageParameter as Param
from rail.estimation.classifier import CatClassifier
from rail.estimation.informer import CatInformer
from rail.core.data import TableHandle, ModelHandle, Hdf5Handle
from sklearn.ensemble import RandomForestClassifier as skl_RandomForestClassifier

from rail.core.common_params import SHARED_PARAMS


class randomForestmodel:
"""
Temporary class to store the trained model.
"""
def __init__(self, skl_classifier, features):
self.skl_classifier = skl_classifier
self.features = features


class RandomForestInformer(CatInformer):
"""Train the random forest classifier"""

name = 'Inform_randomForestClassifier'
config_options = CatInformer.config_options.copy()
config_options.update(
class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"),
bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"),
redshift_col=Param(str, "sz", msg="Redshift column names"),
bin_edges=Param(tuple, [0,0.5,1.0], msg="Binning for training data"),
random_seed=Param(int, msg="random seed"),
no_assign=Param(int, -99, msg="Value for no assignment flag"),)
outputs = [('model', ModelHandle)]

def __init__(self, args, comm=None):
CatInformer.__init__(self, args, comm=comm)

def run(self):
# Load the training data
if self.config.hdf5_groupname:
training_data_table = self.get_data('input')[self.config.hdf5_groupname]
else: # pragma: no cover
training_data_table = self.get_data('input')

# Pull out the appropriate columns and combinations of the data
print(f"Using these bands to train the tomography selector: {self.config.bands}")

# Generate the training data that we will use
# We record both the name of the column and the data itself
features = []
training_data = []
for b1 in self.config.class_bands[:]:
b1_cat=self.config.bands[b1]
# First we use the magnitudes themselves
features.append(b1)
training_data.append(training_data_table[b1_cat])
# We also use the colours as training data, even the redundant ones
for b2 in self.config.class_bands[:]:
b2_cat=self.config.bands[b2]
if b1 < b2:
features.append(f"{b1}-{b2}")
training_data.append(training_data_table[b1_cat] - training_data_table[b2_cat])
training_data = np.array(training_data).T

print("Training data for bin classifier has shape ", training_data.shape)

# Now put the training data into redshift bins
# We use -99 to indicate that we are outside the desired ranges
z = training_data_table[self.config.redshift_col]
training_bin = np.repeat(self.config.no_assign, len(z))
print("Using these bin edges:", self.config.bin_edges)
for i, zmin in enumerate(self.config.bin_edges[:-1]):
zmax = self.config.bin_edges[i + 1]
training_bin[(z > zmin) & (z < zmax)] = i
ntrain_bin = ((z > zmin) & (z < zmax)).sum()
print(f"Training set: {ntrain_bin} objects in tomographic bin {i}")

# Can be replaced with any classifier
skl_classifier = skl_RandomForestClassifier(
max_depth=10,
max_features=None,
n_estimators=20,
random_state=self.config.random_seed,
)
skl_classifier.fit(training_data, training_bin)

#return classifier, features
self.model = randomForestmodel(skl_classifier, features)
self.add_data('model', self.model)


class RandomForestClassifier(CatClassifier):
"""Classifier that assigns tomographic
bins based on random forest method"""

name = 'randomForestClassifier'
config_options = CatClassifier.config_options.copy()
config_options.update(
id_name=Param(str, "", msg="Column name for the object ID in the input data, if empty the row index is used as the ID."),
class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"),
bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"),)
outputs = [('output', Hdf5Handle)]

def __init__(self, args, comm=None):
CatClassifier.__init__(self, args, comm=comm)


def open_model(self, **kwargs):
CatClassifier.open_model(self, **kwargs)
if self.model is None: # pragma: no cover
return
self.skl_classifier = self.model.skl_classifier
self.features = self.model.features


def run(self):
"""Apply the classifier to the measured magnitudes"""

if self.config.hdf5_groupname:
test_data = self.get_data('input')[self.config.hdf5_groupname]
else: # pragma: no cover
test_data = self.get_data('input')

data = []
for f in self.features:
# may be a single band
if len(f) == 1:
f_cat=self.config.bands[f]
col = test_data[f_cat]
# or a colour
else:
b1, b2 = f.split("-")
b1_cat=self.config.bands[b1]
b2_cat=self.config.bands[b2]
col = (test_data[b1_cat] - test_data[b2_cat])
if np.all(~np.isfinite(col)):
# entire column is NaN. Hopefully this will get deselected elsewhere
col[:] = 30.0
else:
ok = np.isfinite(col)
col[~ok] = col[ok].max()
data.append(col)
data = np.array(data).T

# Run the random forest on this data chunk
bin_index = self.skl_classifier.predict(data)

if self.config.id_name != "":
# below is commented out and replaced by a redundant line
# because the data doesn't have ID yet
obj_id = test_data[self.config.id_name]
elif self.config.id_name == "":
# ID set to row index
b=self.config.bands[self.config.class_bands[0]]
obj_id = np.arange(len(test_data[b]))
self.config.id_name="row_index"

class_id = dict(data=OrderedDict([(self.config.id_name, obj_id), ("class_id", bin_index)]))
self.add_data('output', class_id)
1 change: 1 addition & 0 deletions src/rail/sklearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from rail.estimation.algos.k_nearneigh import *
from rail.estimation.algos.sklearn_neurnet import *
from rail.estimation.algos.nz_dir import *
from rail.estimation.algos.random_forest import *
77 changes: 74 additions & 3 deletions tests/sklearn/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rail.core.stage import RailStage
from rail.core.utils import RAILDIR
from rail.core.data import TableHandle
from rail.estimation.algos import k_nearneigh, sklearn_neurnet
from rail.estimation.algos import k_nearneigh, sklearn_neurnet, random_forest

sci_ver_str = scipy.__version__.split(".")

Expand Down Expand Up @@ -83,7 +83,6 @@ def test_KNearNeigh():
# assert np.isclose(results.ancil['zmode'], zb_expected).all()
assert np.isclose(results.ancil["zmode"], rerun_results.ancil["zmode"]).all()


# test for k=1 when data point has same value, used to cause errors because of
# a divide by zero, should be fixed now but add a test
def test_same_data_knn():
Expand All @@ -104,10 +103,82 @@ def test_same_data_knn():
assert ~(np.isnan(modes).all())
os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))


def test_catch_bad_bands():
params = dict(bands="u,g,r,i,z,y")
with pytest.raises(ValueError):
sklearn_neurnet.SklNeurNetInformer.make_stage(hdf5_groupname="", **params)
with pytest.raises(ValueError):
sklearn_neurnet.SklNeurNetEstimator.make_stage(hdf5_groupname="", **params)


def test_randomForestClassifier():
class_bands = [ "r","i","z"]
bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
bin_edges=[0,0.2,0.5]

train_config_dict=dict(
class_bands=class_bands,
bands=bands,
redshift_col="redshift",
bin_edges=bin_edges,
random_seed=10,
hdf5_groupname="photometry",
model="model.tmp",
)

estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="")

train_algo = random_forest.RandomForestInformer
tomo_algo = random_forest.RandomForestClassifier
results, rerun_results, rerun3_results = one_algo(
"randomForestClassifier", train_algo, tomo_algo, train_config_dict, estim_config_dict,
is_classifier=True,
)
assert np.isclose(results["data"]["class_id"], rerun_results["data"]["class_id"]).all()
assert len(results["data"]["class_id"])==len(results["data"]["row_index"])


def test_randomForestClassifier_id():
class_bands = [ "r","i","z"]
bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
bin_edges=[0,0.2,0.5]

train_config_dict=dict(
class_bands=class_bands,
bands=bands,
redshift_col="redshift",
bin_edges=bin_edges,
random_seed=10,
hdf5_groupname="photometry",
model="model.tmp",
)
estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="id")

train_algo = random_forest.RandomForestInformer
tomo_algo = random_forest.RandomForestClassifier

traindata = os.path.join(RAILDIR, 'rail/examples_data/testdata/training_100gal.hdf5')
validdata = os.path.join(RAILDIR, 'rail/examples_data/testdata/validation_10gal.hdf5')

DS = RailStage.data_store
DS.__class__.allow_overwrite = True
DS.clear()
training_data = DS.read_file('training_data', TableHandle, traindata)
validation_data = DS.read_file('validation_data', TableHandle, validdata)

train_pz = train_algo.make_stage(**train_config_dict)
train_pz.inform(training_data)
pz = tomo_algo.make_stage(name="randomForestClassifier", **estim_config_dict)
estim = pz.classify(training_data)
results=estim.data

os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))
model_file = estim_config_dict.get('model', 'None')
if model_file != 'None':
try:
os.remove(model_file)
except FileNotFoundError: #pragma: no cover
pass

assert len(results["data"]["class_id"])==len(results["data"]["id"])

0 comments on commit 10ba2ab

Please sign in to comment.