diff --git a/README.md b/README.md index 220d180..78145ed 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ photometric redshift data products, including uncertainties and summary statistics, and stress-test them under realistically complex systematics. A detailed description of RAIL's modular structure is available in the [Overview](https://lsstdescrail.readthedocs.io/en/latest/source/overview.html) -on ReadTheDocs. +on ReadTheDocs. RAIL serves as the infrastructure supporting many extragalactic applications of the Legacy Survey of Space and Time (LSST) on the Vera C. Rubin Observatory, diff --git a/src/rail/estimation/algos/random_forest.py b/src/rail/estimation/algos/random_forest.py new file mode 100644 index 0000000..b2c89fb --- /dev/null +++ b/src/rail/estimation/algos/random_forest.py @@ -0,0 +1,166 @@ +""" +An example classifier that uses catalogue information to +classify objects into tomoragphic bins using random forest. +This is the base method in TXPipe, adapted from TXpipe/binning/random_forest.py +Note: extra dependence on sklearn and input training file. +""" + +from collections import OrderedDict +import numpy as np +from ceci.config import StageParameter as Param +from rail.estimation.classifier import CatClassifier +from rail.estimation.informer import CatInformer +from rail.core.data import TableHandle, ModelHandle, Hdf5Handle +from sklearn.ensemble import RandomForestClassifier as skl_RandomForestClassifier + +from rail.core.common_params import SHARED_PARAMS + + +class randomForestmodel: + """ + Temporary class to store the trained model. + """ + def __init__(self, skl_classifier, features): + self.skl_classifier = skl_classifier + self.features = features + + +class RandomForestInformer(CatInformer): + """Train the random forest classifier""" + + name = 'Inform_randomForestClassifier' + config_options = CatInformer.config_options.copy() + config_options.update( + class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"), + bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"), + redshift_col=Param(str, "sz", msg="Redshift column names"), + bin_edges=Param(tuple, [0,0.5,1.0], msg="Binning for training data"), + random_seed=Param(int, msg="random seed"), + no_assign=Param(int, -99, msg="Value for no assignment flag"),) + outputs = [('model', ModelHandle)] + + def __init__(self, args, comm=None): + CatInformer.__init__(self, args, comm=comm) + + def run(self): + # Load the training data + if self.config.hdf5_groupname: + training_data_table = self.get_data('input')[self.config.hdf5_groupname] + else: # pragma: no cover + training_data_table = self.get_data('input') + + # Pull out the appropriate columns and combinations of the data + print(f"Using these bands to train the tomography selector: {self.config.bands}") + + # Generate the training data that we will use + # We record both the name of the column and the data itself + features = [] + training_data = [] + for b1 in self.config.class_bands[:]: + b1_cat=self.config.bands[b1] + # First we use the magnitudes themselves + features.append(b1) + training_data.append(training_data_table[b1_cat]) + # We also use the colours as training data, even the redundant ones + for b2 in self.config.class_bands[:]: + b2_cat=self.config.bands[b2] + if b1 < b2: + features.append(f"{b1}-{b2}") + training_data.append(training_data_table[b1_cat] - training_data_table[b2_cat]) + training_data = np.array(training_data).T + + print("Training data for bin classifier has shape ", training_data.shape) + + # Now put the training data into redshift bins + # We use -99 to indicate that we are outside the desired ranges + z = training_data_table[self.config.redshift_col] + training_bin = np.repeat(self.config.no_assign, len(z)) + print("Using these bin edges:", self.config.bin_edges) + for i, zmin in enumerate(self.config.bin_edges[:-1]): + zmax = self.config.bin_edges[i + 1] + training_bin[(z > zmin) & (z < zmax)] = i + ntrain_bin = ((z > zmin) & (z < zmax)).sum() + print(f"Training set: {ntrain_bin} objects in tomographic bin {i}") + + # Can be replaced with any classifier + skl_classifier = skl_RandomForestClassifier( + max_depth=10, + max_features=None, + n_estimators=20, + random_state=self.config.random_seed, + ) + skl_classifier.fit(training_data, training_bin) + + #return classifier, features + self.model = randomForestmodel(skl_classifier, features) + self.add_data('model', self.model) + + +class RandomForestClassifier(CatClassifier): + """Classifier that assigns tomographic + bins based on random forest method""" + + name = 'randomForestClassifier' + config_options = CatClassifier.config_options.copy() + config_options.update( + id_name=Param(str, "", msg="Column name for the object ID in the input data, if empty the row index is used as the ID."), + class_bands=Param(tuple, ["r","i","z"], msg="Which bands to use for classification"), + bands=Param(dict, {"r":"mag_r_lsst", "i":"mag_i_lsst", "z":"mag_z_lsst"}, msg="column names for the the bands"),) + outputs = [('output', Hdf5Handle)] + + def __init__(self, args, comm=None): + CatClassifier.__init__(self, args, comm=comm) + + + def open_model(self, **kwargs): + CatClassifier.open_model(self, **kwargs) + if self.model is None: # pragma: no cover + return + self.skl_classifier = self.model.skl_classifier + self.features = self.model.features + + + def run(self): + """Apply the classifier to the measured magnitudes""" + + if self.config.hdf5_groupname: + test_data = self.get_data('input')[self.config.hdf5_groupname] + else: # pragma: no cover + test_data = self.get_data('input') + + data = [] + for f in self.features: + # may be a single band + if len(f) == 1: + f_cat=self.config.bands[f] + col = test_data[f_cat] + # or a colour + else: + b1, b2 = f.split("-") + b1_cat=self.config.bands[b1] + b2_cat=self.config.bands[b2] + col = (test_data[b1_cat] - test_data[b2_cat]) + if np.all(~np.isfinite(col)): + # entire column is NaN. Hopefully this will get deselected elsewhere + col[:] = 30.0 + else: + ok = np.isfinite(col) + col[~ok] = col[ok].max() + data.append(col) + data = np.array(data).T + + # Run the random forest on this data chunk + bin_index = self.skl_classifier.predict(data) + + if self.config.id_name != "": + # below is commented out and replaced by a redundant line + # because the data doesn't have ID yet + obj_id = test_data[self.config.id_name] + elif self.config.id_name == "": + # ID set to row index + b=self.config.bands[self.config.class_bands[0]] + obj_id = np.arange(len(test_data[b])) + self.config.id_name="row_index" + + class_id = dict(data=OrderedDict([(self.config.id_name, obj_id), ("class_id", bin_index)])) + self.add_data('output', class_id) diff --git a/src/rail/sklearn/__init__.py b/src/rail/sklearn/__init__.py index 9f1b747..d6aa9d5 100644 --- a/src/rail/sklearn/__init__.py +++ b/src/rail/sklearn/__init__.py @@ -3,3 +3,4 @@ from rail.estimation.algos.k_nearneigh import * from rail.estimation.algos.sklearn_neurnet import * from rail.estimation.algos.nz_dir import * +from rail.estimation.algos.random_forest import * diff --git a/tests/sklearn/test_algos.py b/tests/sklearn/test_algos.py index 3c962fe..d4a1c79 100644 --- a/tests/sklearn/test_algos.py +++ b/tests/sklearn/test_algos.py @@ -7,7 +7,7 @@ from rail.core.stage import RailStage from rail.core.utils import RAILDIR from rail.core.data import TableHandle -from rail.estimation.algos import k_nearneigh, sklearn_neurnet +from rail.estimation.algos import k_nearneigh, sklearn_neurnet, random_forest sci_ver_str = scipy.__version__.split(".") @@ -83,7 +83,6 @@ def test_KNearNeigh(): # assert np.isclose(results.ancil['zmode'], zb_expected).all() assert np.isclose(results.ancil["zmode"], rerun_results.ancil["zmode"]).all() - # test for k=1 when data point has same value, used to cause errors because of # a divide by zero, should be fixed now but add a test def test_same_data_knn(): @@ -104,10 +103,82 @@ def test_same_data_knn(): assert ~(np.isnan(modes).all()) os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True)) - + def test_catch_bad_bands(): params = dict(bands="u,g,r,i,z,y") with pytest.raises(ValueError): sklearn_neurnet.SklNeurNetInformer.make_stage(hdf5_groupname="", **params) with pytest.raises(ValueError): sklearn_neurnet.SklNeurNetEstimator.make_stage(hdf5_groupname="", **params) + + +def test_randomForestClassifier(): + class_bands = [ "r","i","z"] + bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"} + bin_edges=[0,0.2,0.5] + + train_config_dict=dict( + class_bands=class_bands, + bands=bands, + redshift_col="redshift", + bin_edges=bin_edges, + random_seed=10, + hdf5_groupname="photometry", + model="model.tmp", + ) + + estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="") + + train_algo = random_forest.RandomForestInformer + tomo_algo = random_forest.RandomForestClassifier + results, rerun_results, rerun3_results = one_algo( + "randomForestClassifier", train_algo, tomo_algo, train_config_dict, estim_config_dict, + is_classifier=True, + ) + assert np.isclose(results["data"]["class_id"], rerun_results["data"]["class_id"]).all() + assert len(results["data"]["class_id"])==len(results["data"]["row_index"]) + + +def test_randomForestClassifier_id(): + class_bands = [ "r","i","z"] + bands = {"r": "mag_r_lsst", "i": "mag_i_lsst", "z": "mag_z_lsst"} + bin_edges=[0,0.2,0.5] + + train_config_dict=dict( + class_bands=class_bands, + bands=bands, + redshift_col="redshift", + bin_edges=bin_edges, + random_seed=10, + hdf5_groupname="photometry", + model="model.tmp", + ) + estim_config_dict=dict(hdf5_groupname="photometry", model="model.tmp", id_name="id") + + train_algo = random_forest.RandomForestInformer + tomo_algo = random_forest.RandomForestClassifier + + traindata = os.path.join(RAILDIR, 'rail/examples_data/testdata/training_100gal.hdf5') + validdata = os.path.join(RAILDIR, 'rail/examples_data/testdata/validation_10gal.hdf5') + + DS = RailStage.data_store + DS.__class__.allow_overwrite = True + DS.clear() + training_data = DS.read_file('training_data', TableHandle, traindata) + validation_data = DS.read_file('validation_data', TableHandle, validdata) + + train_pz = train_algo.make_stage(**train_config_dict) + train_pz.inform(training_data) + pz = tomo_algo.make_stage(name="randomForestClassifier", **estim_config_dict) + estim = pz.classify(training_data) + results=estim.data + + os.remove(pz.get_output(pz.get_aliased_tag('output'), final_name=True)) + model_file = estim_config_dict.get('model', 'None') + if model_file != 'None': + try: + os.remove(model_file) + except FileNotFoundError: #pragma: no cover + pass + + assert len(results["data"]["class_id"])==len(results["data"]["id"]) \ No newline at end of file