diff --git a/alphastats/dataset/dataset.py b/alphastats/dataset/dataset.py index 65d0b25d..da5430d1 100644 --- a/alphastats/dataset/dataset.py +++ b/alphastats/dataset/dataset.py @@ -151,12 +151,31 @@ def _check_loader(loader): ) def _create_id_dicts(self, sep: str = ";") -> Tuple[dict, dict, dict]: - """Create mapprings from gene, protein to feature and from feature to repr.""" + """ + Create mappings from gene and protein to feature, and from feature to representation. + Features are the entities measured in each sample, usually protein groups represented by semicolon separated protein ids. + This is to maintain the many-to-many relationships between the three entities feature, protein and gene. + + This method processes the raw input data to generate three dictionaries: + 1. gene_to_features_map: Maps each gene to a list of features. + 2. protein_to_features_map: Maps each protein to a list of features. + 3. feature_to_repr_map: Maps each feature to its representation string. + + Args: + sep (str): The separator used to split gene and protein identifiers. Default is ";". + + Returns: + Tuple[dict, dict, dict]: A tuple containing three dictionaries: + - gene_to_features_map (dict): A dictionary mapping genes to features. + - protein_to_features_map (dict): A dictionary mapping proteins to features. + - feature_to_repr_map (dict): A dictionary mapping features to their representation strings. + """ features = self.mat.columns.to_list() - gene_to_features_map = defaultdict(lambda: []) - protein_to_features_map = defaultdict(lambda: []) - feature_to_repr_map = defaultdict(lambda x: x) + gene_to_features_map = defaultdict(list) + protein_to_features_map = defaultdict(list) + feature_to_repr_map = {} + # TODO: Make sure both iterators are with zip after merging branches. for proteins, feature in self.rawinput[[Cols.INDEX, Cols.INDEX]].itertuples( index=False @@ -469,7 +488,8 @@ def _get_features_for_gene_name( self, gene_name: str, ) -> list: - """Get protein groups from gene id. If gene id is not present, return gene id, as we might already have a gene id. + # TODO: This should raise an error and not return the gene name if it is not actually in the data. + """Get feature from gene name. If gene name is not present, return gene name, as we might already have a gene id. 'HEL114' -> ['P18206;A0A024QZN4;V9HWK2;B3KXA2;Q5JQ13;B4DKC9;B4DTM7;A0A096LPE1'] Args: diff --git a/alphastats/loader/maxquant_loader.py b/alphastats/loader/maxquant_loader.py index 2179b500..c51da441 100644 --- a/alphastats/loader/maxquant_loader.py +++ b/alphastats/loader/maxquant_loader.py @@ -46,11 +46,13 @@ def __init__( self._set_filter_columns_to_true_false() self._read_all_column_names_as_string() + # TODO externalize to a method intensity_columns = [ col for col in self.rawinput.columns if intensity_column.replace("[sample]", "") in col ] + # TODO: explain why we do this if len(self.rawinput.dropna(subset=intensity_columns, how="all")) != len( self.rawinput ): diff --git a/alphastats/plots/intensity_plot.py b/alphastats/plots/intensity_plot.py index 07b65042..b9265d96 100644 --- a/alphastats/plots/intensity_plot.py +++ b/alphastats/plots/intensity_plot.py @@ -59,6 +59,7 @@ def __init__( self.method = method self.add_significance = add_significance self.log_scale = log_scale + # TODO: rename y_axis to make clear this is not a name from the original data self.y_axis = self.intensity_column.replace("[sample]", "").strip() if self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]: self.y_axis = "log2(" + self.y_axis + ")" @@ -139,7 +140,6 @@ def _add_significance(plot): return plot def _prepare_data(self): - # TODO use difflib to find similar ProteinId if ProteinGroup is not present df = ( self.mat[self.protein_id].melt( ignore_index=False, diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 374d4c06..21933ecb 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1061,6 +1061,7 @@ def test_preprocess_replace_zero(self): ) def test_create_id_mapping(self): + # TODO: Test the actual dicts. """Test id maps""" self.assertEqual(len(self.obj._gene_to_features_map), 21) self.assertEqual(len(self.obj._gene_to_features_map["G14"]), 2)