bento-platform · noctillion · Nov 11, 2024 · Nov 11, 2024 · v-rocheleau · Nov 12, 2024
diff --git a/config/constants.py b/config/constants.py
@@ -57,4 +57,4 @@
 GENERATE_DIFFERENTIAL_EXPERIMENT_INFO_MATRIX = False
 NUMBER_OF_GROUPS = 3
 NUMBER_OF_SAMPLES = 9
-GFF3_URL = "ftp://ftp.ensembl.org/pub/release-112/gff3/homo_sapiens/Homo_sapiens.GRCh38.112.gff3.gz"
+GFF3_URL = "ftp://ftp.ensembl.org/pub/release-113/gff3/homo_sapiens/Homo_sapiens.GRCh38.113.gff3.gz"  # Ensembl release 113, human GRCh38 annotation (gzip-compressed GFF3)
diff --git a/individuals/generator.py b/individuals/generator.py
@@ -1,4 +1,6 @@
 import numpy as np
+from urllib.parse import urlparse
+import os
 from config.constants import (
     AGE_MEAN, AGE_SD, AGE_MIN, AGE_MAX, DISEASE_MASS_DISTRIBUTION, PHENOTYPIC_FEATURE_MASS_DISTRIBUTION,
     LAB_MIN, LAB_MAX, LAB_MEAN, P_EXCLUDED, P_SMOKING_STATUS_PRESENT, MEDICAL_ACTION_MASS_DISTRIBUTION,
@@ -44,6 +46,7 @@ def __init__(self, rng):
         self.phenopackets = []
         self.experiments = []
         self.transcriptomic_matrix_generator = TranscriptomicMatrixGenerator()
+        self.file_path = self.get_gff_filename(GFF3_URL) 
 
         # fix some probability weightings over the whole dataset
         self.choice_weights = {
@@ -58,26 +61,35 @@ def __init__(self, rng):
             "synthetic_experiments": rng.gaussian_weights(len(TISSUES_WITH_EXPERIMENTS))
         }
 
+    def get_gff_filename(self, url):
+        """
+        Return the last path component of the given URL, i.e. its file name.
+        """
+        url_path = urlparse(url).path
+        return os.path.basename(url_path)
+
     def generate_and_assign_matrices(self, biosamples_rna_seq):
-        groups = self.transcriptomic_matrix_generator.split_into_groups(biosamples_rna_seq, NUMBER_OF_GROUPS, NUMBER_OF_SAMPLES)
+        # Download the GFF3 file if it is not already on disk, then parse it once (before the loop) to load gene names
+        self.transcriptomic_matrix_generator.download_gff(GFF3_URL, self.file_path)
 
+        # Split the biosamples into groups; each group gets its own counts matrix CSV
+        groups = self.transcriptomic_matrix_generator.split_into_groups(biosamples_rna_seq, NUMBER_OF_GROUPS, NUMBER_OF_SAMPLES)
         for idx, group in enumerate(groups):
             matrix_filename = f"counts_matrix_group_{idx + 1}.csv"
-            # Set biosamples for the current group
-            self.transcriptomic_matrix_generator.generate_gene_names(GFF3_URL)
             self.transcriptomic_matrix_generator.set_samples(group, NUMBER_OF_SAMPLES)
             counts_matrix = self.transcriptomic_matrix_generator.generate_counts_matrix()
             self.transcriptomic_matrix_generator.write_to_csv(counts_matrix, matrix_filename)
+            print(f"Counts matrix generated for group {idx + 1}")
+
             if GENERATE_EXPERIMENT_INFO_MATRIX:
                 experiment_info_matrix = self.transcriptomic_matrix_generator.generate_experiment_info_matrix()
                 self.transcriptomic_matrix_generator.write_to_csv(experiment_info_matrix, f"experiment_info_matrix_group_{idx + 1}.csv")
             if GENERATE_DIFFERENTIAL_EXPERIMENT_INFO_MATRIX:
                 self.transcriptomic_matrix_generator.write_differentially_expressed_genes_to_csv(f"differentially_expressed_genes_group_{idx + 1}.csv")
 
-            # Assign matrix filename to experiments metadata for each biosample in group
             for biosample_id in group:
                 self.add_experiment_to_biosample(biosample_id, matrix_filename)
-            
+
     def add_experiment_to_biosample(self, biosample_id, matrix_filename):
         # Create experiment metadata for RNA-Seq count matrix
         experiment_id = self.rng.uuid4()

diff --git a/transcriptomics/transcriptomics_matrix_generator.py b/transcriptomics/transcriptomics_matrix_generator.py
@@ -45,23 +45,32 @@ def load_sample_info(self, json_file_path):
             self.treatments = [item['Treatment'] for item in sample_info]
             self.experiment_id = [item['ExperimentID'] for item in sample_info]
 
-    def download_gff(self, url, file_path):   
+    def download_gff(self, url, file_path):
         if not os.path.exists(file_path):
             subprocess.run(['wget', '-O', file_path, url], check=True)
+        self.process_gff(file_path)  # always parse, whether the file was just downloaded or was already present
+
+    def process_gff(self, file_path):
         with gzip.open(file_path, 'rt') as file:
             gff_data = pd.read_csv(file, sep='\t', comment='#', header=None, names=[
                 'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'
             ], dtype=str)
         genes = gff_data[gff_data['feature'] == 'gene'].copy()
-        genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)')
+        genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)', expand=False)  # expand=False: get a Series, not a one-column DataFrame
         genes.loc[:, 'length'] = genes['end'].astype(int) - genes['start'].astype(int) + 1
-        genes = genes['GeneName'].dropna().tolist()
-        return genes
+        gene_info = genes[['GeneName', 'length']].dropna().drop_duplicates(subset='GeneName')  # drop genes with no Name= match; one row per gene name
+        output_file_csv = 'gene_lengths.csv'  # relative path, so it is written to the current working directory
+        gene_info.to_csv(output_file_csv, index=False)
+        print(f"Gene lengths have been saved to {output_file_csv}.")
+        # Keep the parsed names and their count on the instance for later use
+        self.gene_names = gene_info['GeneName'].tolist()
+        self.num_genes = len(self.gene_names)
+        return output_file_csv, file_path
 
     def generate_gene_names(self, url):
         file_name = os.path.basename(urlparse(url).path)
-        self.gene_names = self.download_gff(url, file_name)
-        self.num_genes = len(self.gene_names)
+        if not getattr(self, 'gene_names', None):  # unset or empty: load the names from the GFF file
+            self.download_gff(url, file_name)
         return self.gene_names
 
     def split_into_groups(self, biosamples, num_groups, max_size=None):
@@ -91,14 +100,23 @@ def split_into_groups(self, biosamples, num_groups, max_size=None):
         return groups
 
     def generate_counts_matrix(self, differential_expr_percentage=10, differential_factor=2.5, dispersion=0.2, outlier_percentage=5, outlier_factor=10):
+        if not getattr(self, 'gene_names', None):  # also covers gene_names never having been set on the instance
+            raise ValueError("No gene names available for count matrix generation.")
+        # dict.fromkeys() drops duplicates but keeps first-seen order; set() order for strings varies between runs
+        unique_gene_names = list(dict.fromkeys(filter(None, self.gene_names)))
+        if not unique_gene_names:
+            raise ValueError("No valid gene names available after filtering duplicates and empty entries.")
+
+        self.num_genes = len(unique_gene_names)
+
         expression_levels = np.random.choice([2, 50, 100, 223, 800], size=self.num_genes, p=[0.1, 0.3, 0.3, 0.2, 0.1])
         matrix = np.zeros((self.num_genes, self.num_samples), dtype=np.int32)
         for i in range(self.num_genes):
             mean_expression = expression_levels[i]
             size = (mean_expression**2) / (mean_expression * dispersion - mean_expression**2) if mean_expression * dispersion > mean_expression else 10
             matrix[i, :] = np.random.negative_binomial(n=size, p=size / (size + mean_expression), size=self.num_samples).astype(int)
         self.apply_modifications(matrix, differential_expr_percentage, differential_factor, outlier_percentage, outlier_factor)
-        df = pd.DataFrame(matrix, columns=self.sample_ids, index=self.gene_names)
+        df = pd.DataFrame(matrix, columns=self.sample_ids, index=unique_gene_names)
         df.index.name = 'GeneID'
         return df