Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/remove duplicates in rcm #13

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,4 @@
GENERATE_DIFFERENTIAL_EXPERIMENT_INFO_MATRIX = False
NUMBER_OF_GROUPS = 3
NUMBER_OF_SAMPLES = 9
GFF3_URL = "ftp://ftp.ensembl.org/pub/release-112/gff3/homo_sapiens/Homo_sapiens.GRCh38.112.gff3.gz"
GFF3_URL = "ftp://ftp.ensembl.org/pub/release-113/gff3/homo_sapiens/Homo_sapiens.GRCh38.113.gff3.gz"
22 changes: 17 additions & 5 deletions individuals/generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import numpy as np
from urllib.parse import urlparse
import os
from config.constants import (
AGE_MEAN, AGE_SD, AGE_MIN, AGE_MAX, DISEASE_MASS_DISTRIBUTION, PHENOTYPIC_FEATURE_MASS_DISTRIBUTION,
LAB_MIN, LAB_MAX, LAB_MEAN, P_EXCLUDED, P_SMOKING_STATUS_PRESENT, MEDICAL_ACTION_MASS_DISTRIBUTION,
Expand Down Expand Up @@ -44,6 +46,7 @@ def __init__(self, rng):
self.phenopackets = []
self.experiments = []
self.transcriptomic_matrix_generator = TranscriptomicMatrixGenerator()
self.file_path = self.get_gff_filename(GFF3_URL)

# fix some probability weightings over the whole dataset
self.choice_weights = {
Expand All @@ -58,26 +61,35 @@ def __init__(self, rng):
"synthetic_experiments": rng.gaussian_weights(len(TISSUES_WITH_EXPERIMENTS))
}

def get_gff_filename(self, url):
"""
Extracts and returns the filename from a URL.
"""
parsed_url = urlparse(url)
return os.path.basename(parsed_url.path)

def generate_and_assign_matrices(self, biosamples_rna_seq):
groups = self.transcriptomic_matrix_generator.split_into_groups(biosamples_rna_seq, NUMBER_OF_GROUPS, NUMBER_OF_SAMPLES)
# Download and process the GFF file
self.transcriptomic_matrix_generator.download_gff(GFF3_URL, self.file_path)

# Split the biosamples into groups and generate matrices
groups = self.transcriptomic_matrix_generator.split_into_groups(biosamples_rna_seq, NUMBER_OF_GROUPS, NUMBER_OF_SAMPLES)
for idx, group in enumerate(groups):
matrix_filename = f"counts_matrix_group_{idx + 1}.csv"
# Set biosamples for the current group
self.transcriptomic_matrix_generator.generate_gene_names(GFF3_URL)
self.transcriptomic_matrix_generator.set_samples(group, NUMBER_OF_SAMPLES)
counts_matrix = self.transcriptomic_matrix_generator.generate_counts_matrix()
self.transcriptomic_matrix_generator.write_to_csv(counts_matrix, matrix_filename)
print(f"Counts matrix generated for group {idx + 1}")

if GENERATE_EXPERIMENT_INFO_MATRIX:
experiment_info_matrix = self.transcriptomic_matrix_generator.generate_experiment_info_matrix()
self.transcriptomic_matrix_generator.write_to_csv(experiment_info_matrix, f"experiment_info_matrix_group_{idx + 1}.csv")
if GENERATE_DIFFERENTIAL_EXPERIMENT_INFO_MATRIX:
self.transcriptomic_matrix_generator.write_differentially_expressed_genes_to_csv(f"differentially_expressed_genes_group_{idx + 1}.csv")

# Assign matrix filename to experiments metadata for each biosample in group
for biosample_id in group:
self.add_experiment_to_biosample(biosample_id, matrix_filename)

def add_experiment_to_biosample(self, biosample_id, matrix_filename):
# Create experiment metadata for RNA-Seq count matrix
experiment_id = self.rng.uuid4()
Expand Down
32 changes: 25 additions & 7 deletions transcriptomics/transcriptomics_matrix_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,23 +45,32 @@ def load_sample_info(self, json_file_path):
self.treatments = [item['Treatment'] for item in sample_info]
self.experiment_id = [item['ExperimentID'] for item in sample_info]

def download_gff(self, url, file_path):
def download_gff(self, url, file_path):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would rename to download_and_process_gff, since it does more than just download now.

if not os.path.exists(file_path):
subprocess.run(['wget', '-O', file_path, url], check=True)
self.process_gff(file_path)

def process_gff(self, file_path):
Copy link

@v-rocheleau v-rocheleau Nov 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would rename to something like write_gene_info, since the function now saves the gene lengths in a csv.

with gzip.open(file_path, 'rt') as file:
gff_data = pd.read_csv(file, sep='\t', comment='#', header=None, names=[
'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'
], dtype=str)
genes = gff_data[gff_data['feature'] == 'gene'].copy()
genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)')
genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)', expand=False)
genes.loc[:, 'length'] = genes['end'].astype(int) - genes['start'].astype(int) + 1
genes = genes['GeneName'].dropna().tolist()
return genes
gene_info = genes[['GeneName', 'length']].dropna().drop_duplicates(subset='GeneName')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Columns should be GeneID and GeneLength for TDS

output_file_csv = 'gene_lengths.csv'
gene_info.to_csv(output_file_csv, index=False)
print(f"Gene lengths have been saved to {output_file_csv}.")

self.gene_names = gene_info['GeneName'].tolist()
self.num_genes = len(self.gene_names)
Copy link

@v-rocheleau v-rocheleau Nov 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this kind of situation, when you need to hold a list of values (gene_names), and make decisions based the length of the list, it is best to simply get the length directly from the property when you need it. With self.gene_names, all we need to get the number of genes is len(self.gene_names).

Otherwise, you have to maintain the state of 2 closely linked properties instead of just 1, which is not ideal for maintainability.

return output_file_csv, file_path

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Return values are not used


def generate_gene_names(self, url):
file_name = os.path.basename(urlparse(url).path)
self.gene_names = self.download_gff(url, file_name)
self.num_genes = len(self.gene_names)
if not hasattr(self, 'gene_names') or not self.gene_names:
self.download_gff(url, file_name)
return self.gene_names
Comment on lines 70 to 74

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems that this function is not used anymore, since process_gff sets self.gene_names. RM


def split_into_groups(self, biosamples, num_groups, max_size=None):
Expand Down Expand Up @@ -91,14 +100,23 @@ def split_into_groups(self, biosamples, num_groups, max_size=None):
return groups

def generate_counts_matrix(self, differential_expr_percentage=10, differential_factor=2.5, dispersion=0.2, outlier_percentage=5, outlier_factor=10):
if not self.gene_names:
raise ValueError("No gene names available for count matrix generation.")

unique_gene_names = list(filter(None, set(self.gene_names)))
if not unique_gene_names:
raise ValueError("No valid gene names available after filtering duplicates and empty entries.")
Comment on lines +103 to +108

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is needed, since the values are already deduplicated when the self.gene_names property is set.


self.num_genes = len(unique_gene_names)

expression_levels = np.random.choice([2, 50, 100, 223, 800], size=self.num_genes, p=[0.1, 0.3, 0.3, 0.2, 0.1])
matrix = np.zeros((self.num_genes, self.num_samples), dtype=np.int32)
for i in range(self.num_genes):
mean_expression = expression_levels[i]
size = (mean_expression**2) / (mean_expression * dispersion - mean_expression**2) if mean_expression * dispersion > mean_expression else 10
matrix[i, :] = np.random.negative_binomial(n=size, p=size / (size + mean_expression), size=self.num_samples).astype(int)
self.apply_modifications(matrix, differential_expr_percentage, differential_factor, outlier_percentage, outlier_factor)
df = pd.DataFrame(matrix, columns=self.sample_ids, index=self.gene_names)
df = pd.DataFrame(matrix, columns=self.sample_ids, index=unique_gene_names)
df.index.name = 'GeneID'
return df

Expand Down