diff --git a/config/constants.py b/config/constants.py index 6fbc803..5e9f37d 100644 --- a/config/constants.py +++ b/config/constants.py @@ -57,4 +57,4 @@ GENERATE_DIFFERENTIAL_EXPERIMENT_INFO_MATRIX = False NUMBER_OF_GROUPS = 3 NUMBER_OF_SAMPLES = 9 -GFF3_URL = "ftp://ftp.ensembl.org/pub/release-112/gff3/homo_sapiens/Homo_sapiens.GRCh38.112.gff3.gz" \ No newline at end of file +GFF3_URL = "ftp://ftp.ensembl.org/pub/release-113/gff3/homo_sapiens/Homo_sapiens.GRCh38.113.gff3.gz" \ No newline at end of file diff --git a/individuals/generator.py b/individuals/generator.py index bba8404..dc6c4b7 100644 --- a/individuals/generator.py +++ b/individuals/generator.py @@ -1,4 +1,6 @@ import numpy as np +from urllib.parse import urlparse +import os from config.constants import ( AGE_MEAN, AGE_SD, AGE_MIN, AGE_MAX, DISEASE_MASS_DISTRIBUTION, PHENOTYPIC_FEATURE_MASS_DISTRIBUTION, LAB_MIN, LAB_MAX, LAB_MEAN, P_EXCLUDED, P_SMOKING_STATUS_PRESENT, MEDICAL_ACTION_MASS_DISTRIBUTION, @@ -44,6 +46,7 @@ def __init__(self, rng): self.phenopackets = [] self.experiments = [] self.transcriptomic_matrix_generator = TranscriptomicMatrixGenerator() + self.file_path = self.get_gff_filename(GFF3_URL) # fix some probability weightings over the whole dataset self.choice_weights = { @@ -58,26 +61,35 @@ def __init__(self, rng): "synthetic_experiments": rng.gaussian_weights(len(TISSUES_WITH_EXPERIMENTS)) } + def get_gff_filename(self, url): + """ + Extracts and returns the filename from a URL. + """ + parsed_url = urlparse(url) + return os.path.basename(parsed_url.path) + def generate_and_assign_matrices(self, biosamples_rna_seq): - groups = self.transcriptomic_matrix_generator.split_into_groups(biosamples_rna_seq, NUMBER_OF_GROUPS, NUMBER_OF_SAMPLES) + # Download and process the GFF file + self.transcriptomic_matrix_generator.download_gff(GFF3_URL, self.file_path) + # Split the biosamples into groups and generate matrices + groups = self.transcriptomic_matrix_generator.split_into_groups(biosamples_rna_seq, NUMBER_OF_GROUPS, NUMBER_OF_SAMPLES) for idx, group in enumerate(groups): matrix_filename = f"counts_matrix_group_{idx + 1}.csv" - # Set biosamples for the current group - self.transcriptomic_matrix_generator.generate_gene_names(GFF3_URL) self.transcriptomic_matrix_generator.set_samples(group, NUMBER_OF_SAMPLES) counts_matrix = self.transcriptomic_matrix_generator.generate_counts_matrix() self.transcriptomic_matrix_generator.write_to_csv(counts_matrix, matrix_filename) + print(f"Counts matrix generated for group {idx + 1}") + if GENERATE_EXPERIMENT_INFO_MATRIX: experiment_info_matrix = self.transcriptomic_matrix_generator.generate_experiment_info_matrix() self.transcriptomic_matrix_generator.write_to_csv(experiment_info_matrix, f"experiment_info_matrix_group_{idx + 1}.csv") if GENERATE_DIFFERENTIAL_EXPERIMENT_INFO_MATRIX: self.transcriptomic_matrix_generator.write_differentially_expressed_genes_to_csv(f"differentially_expressed_genes_group_{idx + 1}.csv") - # Assign matrix filename to experiments metadata for each biosample in group for biosample_id in group: self.add_experiment_to_biosample(biosample_id, matrix_filename) - + def add_experiment_to_biosample(self, biosample_id, matrix_filename): # Create experiment metadata for RNA-Seq count matrix experiment_id = self.rng.uuid4() diff --git a/transcriptomics/transcriptomics_matrix_generator.py b/transcriptomics/transcriptomics_matrix_generator.py index 438227d..c7deaeb 100644 --- a/transcriptomics/transcriptomics_matrix_generator.py +++ b/transcriptomics/transcriptomics_matrix_generator.py @@ -45,23 +45,32 @@ def load_sample_info(self, json_file_path): self.treatments = [item['Treatment'] for item in sample_info] self.experiment_id = [item['ExperimentID'] for item in sample_info] - def download_gff(self, url, file_path): + def download_gff(self, url, file_path): if not os.path.exists(file_path): subprocess.run(['wget', '-O', file_path, url], check=True) + self.process_gff(file_path) + + def process_gff(self, file_path): with gzip.open(file_path, 'rt') as file: gff_data = pd.read_csv(file, sep='\t', comment='#', header=None, names=[ 'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute' ], dtype=str) genes = gff_data[gff_data['feature'] == 'gene'].copy() - genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)') + genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)', expand=False) genes.loc[:, 'length'] = genes['end'].astype(int) - genes['start'].astype(int) + 1 - genes = genes['GeneName'].dropna().tolist() - return genes + gene_info = genes[['GeneName', 'length']].dropna().drop_duplicates(subset='GeneName') + output_file_csv = 'gene_lengths.csv' + gene_info.to_csv(output_file_csv, index=False) + print(f"Gene lengths have been saved to {output_file_csv}.") + + self.gene_names = gene_info['GeneName'].tolist() + self.num_genes = len(self.gene_names) + return output_file_csv, file_path def generate_gene_names(self, url): file_name = os.path.basename(urlparse(url).path) - self.gene_names = self.download_gff(url, file_name) - self.num_genes = len(self.gene_names) + if not hasattr(self, 'gene_names') or not self.gene_names: + self.download_gff(url, file_name) return self.gene_names def split_into_groups(self, biosamples, num_groups, max_size=None): @@ -91,6 +100,15 @@ def split_into_groups(self, biosamples, num_groups, max_size=None): return groups def generate_counts_matrix(self, differential_expr_percentage=10, differential_factor=2.5, dispersion=0.2, outlier_percentage=5, outlier_factor=10): + if not self.gene_names: + raise ValueError("No gene names available for count matrix generation.") + + unique_gene_names = list(filter(None, set(self.gene_names))) + if not unique_gene_names: + raise ValueError("No valid gene names available after filtering duplicates and empty entries.") + + self.num_genes = len(unique_gene_names) + expression_levels = np.random.choice([2, 50, 100, 223, 800], size=self.num_genes, p=[0.1, 0.3, 0.3, 0.2, 0.1]) matrix = np.zeros((self.num_genes, self.num_samples), dtype=np.int32) for i in range(self.num_genes): @@ -98,7 +116,7 @@ def generate_counts_matrix(self, differential_expr_percentage=10, differential_f size = (mean_expression**2) / (mean_expression * dispersion - mean_expression**2) if mean_expression * dispersion > mean_expression else 10 matrix[i, :] = np.random.negative_binomial(n=size, p=size / (size + mean_expression), size=self.num_samples).astype(int) self.apply_modifications(matrix, differential_expr_percentage, differential_factor, outlier_percentage, outlier_factor) - df = pd.DataFrame(matrix, columns=self.sample_ids, index=self.gene_names) + df = pd.DataFrame(matrix, columns=self.sample_ids, index=unique_gene_names) df.index.name = 'GeneID' return df