diff --git a/src/rail/estimation/algos/somoclu_som.py b/src/rail/estimation/algos/somoclu_som.py index f4ccfc0..ec04f08 100644 --- a/src/rail/estimation/algos/somoclu_som.py +++ b/src/rail/estimation/algos/somoclu_som.py @@ -193,6 +193,10 @@ class SOMocluInformer(CatInformer): gridtype=Param(str, 'rectangular', msg="Optional parameter to specify the grid form of the nodes:" + "* 'rectangular': rectangular neurons (default)" + "* 'hexagonal': hexagonal neurons"), + n_epochs=Param(int, 10, msg="number of training epochs."), + initialization=Param(str, 'pca', msg="method of initializing the SOM:" + +"* 'pca': principal componant analysis (default)" + +"* 'random' randomly initialize the SOM"), maptype=Param(str, 'planar', msg="Optional parameter to specify the map topology:" + "* 'planar': Planar map (default)" + "* 'toroid': Toroid map"), @@ -220,7 +224,7 @@ def run(self): if np.isnan(self.config.nondetect_val): # pragma: no cover mask = np.isnan(training_data[col]) else: - mask = np.isclose(training_data[col], self.config.nondetect_val) + mask = np.logical_or(np.isinf(training_data[col]), np.isclose(training_data[col], self.config.nondetect_val)) training_data[col][mask] = self.config.mag_limits[col] colors = _computemagcolordata(training_data, self.config.ref_band, @@ -228,9 +232,9 @@ def run(self): som = Somoclu(self.config.n_columns, self.config.n_rows, gridtype=self.config.gridtype, compactsupport=False, - maptype=self.config.maptype, initialization='pca') + maptype=self.config.maptype, initialization=self.config.initialization) - som.train(colors) + som.train(colors, epochs=self.config.n_epochs,) modeldict = dict(som=som, usecols=self.config.bands, ref_column=self.config.ref_band, @@ -323,7 +327,9 @@ class SOMocluSummarizer(SZPZSummarizer): phot_weightcol=Param(str, "", msg="name of photometry weight, if present"), spec_weightcol=Param(str, "", msg="name of specz weight col, if present"), split=Param(int, 200, msg="the size of data chunks when calculating the distances between the codebook and data"), - nsamples=Param(int, 20, msg="number of bootstrap samples to generate")) + nsamples=Param(int, 20, msg="number of bootstrap samples to generate"), + useful_clusters=Param(np.ndarray, np.array([]), msg="the cluster indices that are used for calibration. If not given, then " + +"all the clusters containing spec sample are used."),) outputs = [('output', QPHandle), ('single_NZ', QPHandle), ('cellid_output', Hdf5Handle), @@ -400,7 +406,6 @@ def run(self): self.zgrid = np.linspace(self.config.zmin, self.config.zmax, self.config.nzbins + 1) - if self.config.n_clusters > self.n_rows * self.n_columns: # pragma: no cover print("Warning: number of clusters cannot be greater than the number of cells ("+str(self.n_rows * self.n_columns)+"). The SOM will NOT be grouped into clusters.") n_clusters = self.n_rows * self.n_columns @@ -445,7 +450,7 @@ def run(self): N_eff_num = 0. N_eff_den = 0. phot_cluster_set = set() - + bad_clusters = set() # make dictionary of ID data to be written out with cell IDs id_dict = {} @@ -455,7 +460,7 @@ def run(self): print(f"Process {self.rank} running summarizer on chunk {s} - {e}") chunk_number = s//self.config.chunk_size - tmp_neff_num, tmp_neff_den = self._process_chunk(test_data, bootstrap_matrix, som_cluster_inds, spec_cluster_set, phot_cluster_set, sz, spec_data['weight'], spec_som_clusterind, N_eff_p_num, N_eff_p_den, hist_vals, id_dict, s, e, first) + tmp_neff_num, tmp_neff_den = self._process_chunk(test_data, bootstrap_matrix, som_cluster_inds, spec_cluster_set, phot_cluster_set, sz, spec_data['weight'], spec_som_clusterind, N_eff_p_num, N_eff_p_den, hist_vals, id_dict, s, e, first, bad_clusters) N_eff_num += tmp_neff_num N_eff_den += tmp_neff_den first = False @@ -474,7 +479,8 @@ def run(self): hist_vals = self.comm.reduce(hist_vals) N_eff_num = self.comm.reduce(N_eff_num) N_eff_den = self.comm.reduce(N_eff_den) - + bad_clusters = self.comm.reduce(bad_clusters) + phot_cluster_list=np.array(list(phot_cluster_set),dtype=int) phot_cluster_total=self.comm.gather(phot_cluster_list) @@ -483,12 +489,8 @@ def run(self): return phot_cluster_total=np.concatenate(phot_cluster_total) phot_cluster_set = set(phot_cluster_total) - uncovered_clusters = phot_cluster_set - spec_cluster_set - bad_cluster = dict(uncovered_clusters=np.array(list(uncovered_clusters))) - print("the following clusters contain photometric data but not spectroscopic data:") - print(uncovered_clusters) - useful_clusters = phot_cluster_set - uncovered_clusters - print(f"{len(useful_clusters)} out of {n_clusters} have usable data") + + print(f"{len(self.useful_clusters)} out of {n_clusters} have usable data") # effective number defined in Heymans et al. (2012) to quantify the photometric representation. # also see Eq.7 in Wright et al. (2020). @@ -507,9 +509,9 @@ def run(self): qp_d = qp.Ensemble(qp.hist, data=dict(bins=self.zgrid, pdfs=fid_hist)) self.add_data('output', sample_ens) self.add_data('single_NZ', qp_d) - self.add_data('uncovered_cluster_file', bad_cluster) + self.add_data('uncovered_cluster_file', bad_clusters) - def _process_chunk(self, test_data, bootstrap_matrix, som_cluster_inds, spec_cluster_set, phot_cluster_set, sz, sweight, spec_som_clusterind, N_eff_p_num, N_eff_p_den, hist_vals, id_dict, start, end, first): + def _process_chunk(self, test_data, bootstrap_matrix, som_cluster_inds, spec_cluster_set, phot_cluster_set, sz, sweight, spec_som_clusterind, N_eff_p_num, N_eff_p_den, hist_vals, id_dict, start, end, first, bad_clusters): for col in self.usecols: if col not in test_data.keys(): # pragma: no cover @@ -534,13 +536,30 @@ def _process_chunk(self, test_data, bootstrap_matrix, som_cluster_inds, spec_clu self._do_chunk_output(id_dict, start, end, first) chunk_phot_cluster_set = set(phot_som_clusterind) - useful_clusters = chunk_phot_cluster_set.intersection(spec_cluster_set) phot_cluster_set.update(chunk_phot_cluster_set) - + uncovered_clusters = phot_cluster_set - spec_cluster_set + bad_cluster = dict(uncovered_clusters=np.array(list(uncovered_clusters))) + print("the following clusters contain photometric data but not spectroscopic data:") + print(uncovered_clusters) + + covered_clusters = phot_cluster_set - uncovered_clusters + if self.config.useful_clusters.size == 0: + self.useful_clusters = covered_clusters + else: # pragma: no cover + if set(self.config.useful_clusters) <= covered_clusters: + self.useful_clusters = self.config.useful_clusters + else: + print("Warning: input useful clusters is not a subset of spec-covered clusters." + +"Taking the intersection.") + self.useful_clusters = np.intersect1d(self.config.useful_clusters, np.asarray(list(covered_clusters))) + if self.useful_clusters.size == 0: # pragma: no cover + raise ValueError("Input useful clusters have no intersection with spec-covered clusters!") + + useful_clusters = self.useful_clusters + tmp_neff_num = np.sum(test_data['weight']) tmp_neff_den = np.sum(test_data['weight'] ** 2) - for i in range(self.config.nsamples): bootstrap_indices = bootstrap_matrix[:,i] bs_specz = sz[bootstrap_indices] diff --git a/tests/som/test_somoclu_summarizers.py b/tests/som/test_somoclu_summarizers.py index 49cf4ba..7fa0954 100644 --- a/tests/som/test_somoclu_summarizers.py +++ b/tests/som/test_somoclu_summarizers.py @@ -54,16 +54,20 @@ def one_algo(key, inform_class, summarizer_class, summary_kwargs): fid_ens = qp.read(summarizer2.get_output(summarizer2.get_aliased_tag("single_NZ"), final_name=True)) meanz = fid_ens.mean().flatten() assert np.isclose(meanz[0], 0.14414913252122552, atol=0.025) + + full_useful_clusters = np.asarray(list(summarizer2.useful_clusters)) + full_uncovered_clusters = np.asarray(list(np.setdiff1d(np.arange(31*31), full_useful_clusters))) + os.remove(summarizer2.get_output(summarizer2.get_aliased_tag("output"), final_name=True)) os.remove(f"tmpsomoclu_" + key + ".pkl") - return summary_ens + return summary_ens, full_useful_clusters, full_uncovered_clusters def test_SomocluSOM(): summary_config_dict = {"n_rows": 21, "n_columns": 21, "column_usage": "colors"} inform_class = somoclu_som.SOMocluInformer summarizerclass = somoclu_som.SOMocluSummarizer - _ = one_algo("SOMomoclu", inform_class, summarizerclass, summary_config_dict) + _,_,_ = one_algo("SOMomoclu", inform_class, summarizerclass, summary_config_dict) def test_SomocluSOM_with_mag_and_colors(): @@ -98,19 +102,35 @@ def test_SomocluSOM_with_columns(): } inform_class = somoclu_som.SOMocluInformer summarizerclass = somoclu_som.SOMocluSummarizer + _,_,_ = one_algo("SOMoclu_wmag", inform_class, summarizerclass, summary_config_dict) + - _ = one_algo("SOMoclu_wmag", inform_class, summarizerclass, summary_config_dict) +def test_SomocluSOM_useful_clusters(): + summary_config_dict = {"n_rows": 21, "n_columns": 21, "column_usage": "colors", "seed":0} + inform_class = somoclu_som.SOMocluInformer + summarizerclass = somoclu_som.SOMocluSummarizer + _, full_useful_clusters, full_uncovered_clusters = one_algo("SOMomoclu1", inform_class, summarizerclass, summary_config_dict) + + summary_config_dict = {"n_rows": 31, "n_columns": 31, "column_usage": "colors", "seed":0, "useful_clusters": np.arange(31*31)} + inform_class = somoclu_som.SOMocluInformer + summarizerclass = somoclu_som.SOMocluSummarizer + _ = one_algo("SOMomoclu2", inform_class, summarizerclass, summary_config_dict) + + summary_config_dict = {"n_rows": 31, "n_columns": 31, "column_usage": "colors", "seed":0, "useful_clusters": full_uncovered_clusters} + inform_class = somoclu_som.SOMocluInformer + summarizerclass = somoclu_som.SOMocluSummarizer + _ = one_algo("SOMomoclu4", inform_class, summarizerclass, summary_config_dict) -def test_SomocluSOM_with_badinput(): +def test_SomocluSOM_wrong_column(): summary_config_dict = { "n_rows": 21, "n_columns": 21, - "column_usage": "something", + "column_usage": "wrong_column", "objid_name": "id", } inform_class = somoclu_som.SOMocluInformer summarizerclass = somoclu_som.SOMocluSummarizer try: - one_algo("SOMoclu_wrong", inform_class, summarizerclass, summary_config_dict) + _ = one_algo("SOMoclu_wrongcolumn", inform_class, summarizerclass, summary_config_dict) except: return