Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add random forest classifier for tomographer #1

Merged
merged 14 commits into from
Sep 11, 2023
Merged
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ photometric redshift data products, including uncertainties and summary
statistics, and stress-test them under realistically complex systematics.
A detailed description of RAIL's modular structure is available in the
[Overview](https://lsstdescrail.readthedocs.io/en/latest/source/overview.html)
on ReadTheDocs.
on ReadTheDocs.

RAIL serves as the infrastructure supporting many extragalactic applications
of the Legacy Survey of Space and Time (LSST) on the Vera C. Rubin Observatory,
Expand Down
166 changes: 166 additions & 0 deletions src/rail/estimation/algos/random_forest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""
An example classifier that uses catalogue information to
classify objects into tomoragphic bins using random forest.
This is the base method in TXPipe, adapted from TXpipe/binning/random_forest.py
Note: extra dependence on sklearn and input training file.
"""

from collections import OrderedDict
import numpy as np
from ceci.config import StageParameter as Param
from rail.estimation.classifier import CatClassifier
from rail.estimation.informer import CatInformer
from rail.core.data import TableHandle, ModelHandle, Hdf5Handle
from sklearn.ensemble import RandomForestClassifier as skl_RandomForestClassifier

from rail.core.common_params import SHARED_PARAMS


class randomForestmodel:
"""
Temporary class to store the trained model.
"""
def __init__(self, skl_classifier, features):
self.skl_classifier = skl_classifier
self.features = features


class RandomForestInformer(CatInformer):
"""Train the random forest classifier"""

name = 'Inform_randomForestClassifier'
config_options = CatInformer.config_options.copy()
config_options.update(
class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"),
bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"),
redshift_col=Param(str, "sz", msg="Redshift column names"),
bin_edges=Param(tuple, [0,0.5,1.0], msg="Binning for training data"),
random_seed=Param(int, msg="random seed"),
no_assign=Param(int, -99, msg="Value for no assignment flag"),)
outputs = [('model', ModelHandle)]

def __init__(self, args, comm=None):
CatInformer.__init__(self, args, comm=comm)

def run(self):
# Load the training data
if self.config.hdf5_groupname:
training_data_table = self.get_data('input')[self.config.hdf5_groupname]
else: # pragma: no cover
training_data_table = self.get_data('input')

# Pull out the appropriate columns and combinations of the data
print(f"Using these bands to train the tomography selector: {self.config.bands}")

# Generate the training data that we will use
# We record both the name of the column and the data itself
features = []
training_data = []
for b1 in self.config.class_bands[:]:
b1_cat=self.config.bands[b1]
# First we use the magnitudes themselves
features.append(b1)
training_data.append(training_data_table[b1_cat])
# We also use the colours as training data, even the redundant ones
for b2 in self.config.class_bands[:]:
b2_cat=self.config.bands[b2]
if b1 < b2:
features.append(f"{b1}-{b2}")
training_data.append(training_data_table[b1_cat] - training_data_table[b2_cat])
training_data = np.array(training_data).T

print("Training data for bin classifier has shape ", training_data.shape)

# Now put the training data into redshift bins
# We use -99 to indicate that we are outside the desired ranges
z = training_data_table[self.config.redshift_col]
training_bin = np.repeat(self.config.no_assign, len(z))
print("Using these bin edges:", self.config.bin_edges)
for i, zmin in enumerate(self.config.bin_edges[:-1]):
zmax = self.config.bin_edges[i + 1]
training_bin[(z > zmin) & (z < zmax)] = i
ntrain_bin = ((z > zmin) & (z < zmax)).sum()
print(f"Training set: {ntrain_bin} objects in tomographic bin {i}")

# Can be replaced with any classifier
skl_classifier = skl_RandomForestClassifier(
max_depth=10,
max_features=None,
n_estimators=20,
random_state=self.config.random_seed,
)
skl_classifier.fit(training_data, training_bin)

#return classifier, features
self.model = randomForestmodel(skl_classifier, features)
self.add_data('model', self.model)


class RandomForestClassifier(CatClassifier):
"""Classifier that assigns tomographic
bins based on random forest method"""

name = 'randomForestClassifier'
config_options = CatClassifier.config_options.copy()
config_options.update(
id_name=Param(str, "", msg="Column name for the object ID in the input data, if empty the row index is used as the ID."),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing needs to be changed here, but I just want to note re: LSSTDESC/rail_base#39 that this will be the first instance of IDs in the RAIL-iverse so should be considered in decision-making about how to consistently reference these throughout src/rail code.

class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"),
bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"),)
outputs = [('output', Hdf5Handle)]

def __init__(self, args, comm=None):
CatClassifier.__init__(self, args, comm=comm)


def open_model(self, **kwargs):
CatClassifier.open_model(self, **kwargs)
if self.model is None: # pragma: no cover
return
self.skl_classifier = self.model.skl_classifier
self.features = self.model.features


def run(self):
"""Apply the classifier to the measured magnitudes"""

if self.config.hdf5_groupname:
test_data = self.get_data('input')[self.config.hdf5_groupname]
else: # pragma: no cover
test_data = self.get_data('input')

data = []
for f in self.features:
hangqianjun marked this conversation as resolved.
Show resolved Hide resolved
# may be a single band
if len(f) == 1:
f_cat=self.config.bands[f]
col = test_data[f_cat]
# or a colour
else:
b1, b2 = f.split("-")
b1_cat=self.config.bands[b1]
b2_cat=self.config.bands[b2]
col = (test_data[b1_cat] - test_data[b2_cat])
if np.all(~np.isfinite(col)):
# entire column is NaN. Hopefully this will get deselected elsewhere
col[:] = 30.0

Check warning on line 145 in src/rail/estimation/algos/random_forest.py

View check run for this annotation

Codecov / codecov/patch

src/rail/estimation/algos/random_forest.py#L145

Added line #L145 was not covered by tests
else:
ok = np.isfinite(col)
col[~ok] = col[ok].max()
data.append(col)
data = np.array(data).T

# Run the random forest on this data chunk
bin_index = self.skl_classifier.predict(data)

if self.config.id_name != "":
# below is commented out and replaced by a redundant line
# because the data doesn't have ID yet
obj_id = test_data[self.config.id_name]
elif self.config.id_name == "":
# ID set to row index
b=self.config.bands[self.config.class_bands[0]]
obj_id = np.arange(len(test_data[b]))
self.config.id_name="row_index"

class_id = dict(data=OrderedDict([(self.config.id_name, obj_id), ("class_id", bin_index)]))
self.add_data('output', class_id)
1 change: 1 addition & 0 deletions src/rail/sklearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from rail.estimation.algos.k_nearneigh import *
from rail.estimation.algos.sklearn_neurnet import *
from rail.estimation.algos.nz_dir import *
from rail.estimation.algos.random_forest import *
77 changes: 74 additions & 3 deletions tests/sklearn/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rail.core.stage import RailStage
from rail.core.utils import RAILDIR
from rail.core.data import TableHandle
from rail.estimation.algos import k_nearneigh, sklearn_neurnet
from rail.estimation.algos import k_nearneigh, sklearn_neurnet, random_forest

sci_ver_str = scipy.__version__.split(".")

Expand Down Expand Up @@ -83,7 +83,6 @@ def test_KNearNeigh():
# assert np.isclose(results.ancil['zmode'], zb_expected).all()
assert np.isclose(results.ancil["zmode"], rerun_results.ancil["zmode"]).all()


# test for k=1 when data point has same value, used to cause errors because of
# a divide by zero, should be fixed now but add a test
def test_same_data_knn():
Expand All @@ -104,10 +103,82 @@ def test_same_data_knn():
assert ~(np.isnan(modes).all())
os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))


def test_catch_bad_bands():
params = dict(bands="u,g,r,i,z,y")
with pytest.raises(ValueError):
sklearn_neurnet.SklNeurNetInformer.make_stage(hdf5_groupname="", **params)
with pytest.raises(ValueError):
sklearn_neurnet.SklNeurNetEstimator.make_stage(hdf5_groupname="", **params)


def test_randomForestClassifier():
class_bands = [ "r","i","z"]
bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
bin_edges=[0,0.2,0.5]

train_config_dict=dict(
class_bands=class_bands,
bands=bands,
redshift_col="redshift",
bin_edges=bin_edges,
random_seed=10,
hdf5_groupname="photometry",
model="model.tmp",
)

estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="")

train_algo = random_forest.RandomForestInformer
tomo_algo = random_forest.RandomForestClassifier
results, rerun_results, rerun3_results = one_algo(
"randomForestClassifier", train_algo, tomo_algo, train_config_dict, estim_config_dict,
is_classifier=True,
)
assert np.isclose(results["data"]["class_id"], rerun_results["data"]["class_id"]).all()
assert len(results["data"]["class_id"])==len(results["data"]["row_index"])


def test_randomForestClassifier_id():
class_bands = [ "r","i","z"]
bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"}
bin_edges=[0,0.2,0.5]

train_config_dict=dict(
class_bands=class_bands,
bands=bands,
redshift_col="redshift",
bin_edges=bin_edges,
random_seed=10,
hdf5_groupname="photometry",
model="model.tmp",
)
estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="id")

train_algo = random_forest.RandomForestInformer
tomo_algo = random_forest.RandomForestClassifier

traindata = os.path.join(RAILDIR, 'rail/examples_data/testdata/training_100gal.hdf5')
validdata = os.path.join(RAILDIR, 'rail/examples_data/testdata/validation_10gal.hdf5')

DS = RailStage.data_store
DS.__class__.allow_overwrite = True
DS.clear()
training_data = DS.read_file('training_data', TableHandle, traindata)
validation_data = DS.read_file('validation_data', TableHandle, validdata)

train_pz = train_algo.make_stage(**train_config_dict)
train_pz.inform(training_data)
pz = tomo_algo.make_stage(name="randomForestClassifier", **estim_config_dict)
estim = pz.classify(training_data)
results=estim.data

os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True))
model_file = estim_config_dict.get('model', 'None')
if model_file != 'None':
try:
os.remove(model_file)
except FileNotFoundError: #pragma: no cover
pass

assert len(results["data"]["class_id"])==len(results["data"]["id"])
Loading