-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix/remove duplicates in rcm #13
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,23 +45,32 @@ def load_sample_info(self, json_file_path): | |
self.treatments = [item['Treatment'] for item in sample_info] | ||
self.experiment_id = [item['ExperimentID'] for item in sample_info] | ||
|
||
def download_gff(self, url, file_path): | ||
def download_gff(self, url, file_path): | ||
if not os.path.exists(file_path): | ||
subprocess.run(['wget', '-O', file_path, url], check=True) | ||
self.process_gff(file_path) | ||
|
||
def process_gff(self, file_path): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would rename to something like |
||
with gzip.open(file_path, 'rt') as file: | ||
gff_data = pd.read_csv(file, sep='\t', comment='#', header=None, names=[ | ||
'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute' | ||
], dtype=str) | ||
genes = gff_data[gff_data['feature'] == 'gene'].copy() | ||
genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)') | ||
genes.loc[:, 'GeneName'] = genes['attribute'].str.extract('Name=([^;]+)', expand=False) | ||
genes.loc[:, 'length'] = genes['end'].astype(int) - genes['start'].astype(int) + 1 | ||
genes = genes['GeneName'].dropna().tolist() | ||
return genes | ||
gene_info = genes[['GeneName', 'length']].dropna().drop_duplicates(subset='GeneName') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Columns should be |
||
output_file_csv = 'gene_lengths.csv' | ||
gene_info.to_csv(output_file_csv, index=False) | ||
print(f"Gene lengths have been saved to {output_file_csv}.") | ||
|
||
self.gene_names = gene_info['GeneName'].tolist() | ||
self.num_genes = len(self.gene_names) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this kind of situation, when you need to hold a list of values ( Otherwise, you have to maintain the state of 2 closely linked properties instead of just 1, which is not ideal for maintainability. |
||
return output_file_csv, file_path | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Return values are not used |
||
|
||
def generate_gene_names(self, url): | ||
file_name = os.path.basename(urlparse(url).path) | ||
self.gene_names = self.download_gff(url, file_name) | ||
self.num_genes = len(self.gene_names) | ||
if not hasattr(self, 'gene_names') or not self.gene_names: | ||
self.download_gff(url, file_name) | ||
return self.gene_names | ||
Comment on lines
70
to
74
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems that this function is not used anymore, since |
||
|
||
def split_into_groups(self, biosamples, num_groups, max_size=None): | ||
|
@@ -91,14 +100,23 @@ def split_into_groups(self, biosamples, num_groups, max_size=None): | |
return groups | ||
|
||
def generate_counts_matrix(self, differential_expr_percentage=10, differential_factor=2.5, dispersion=0.2, outlier_percentage=5, outlier_factor=10): | ||
if not self.gene_names: | ||
raise ValueError("No gene names available for count matrix generation.") | ||
|
||
unique_gene_names = list(filter(None, set(self.gene_names))) | ||
if not unique_gene_names: | ||
raise ValueError("No valid gene names available after filtering duplicates and empty entries.") | ||
Comment on lines
+103
to
+108
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this is needed, since the values are already deduplicated when the |
||
|
||
self.num_genes = len(unique_gene_names) | ||
|
||
expression_levels = np.random.choice([2, 50, 100, 223, 800], size=self.num_genes, p=[0.1, 0.3, 0.3, 0.2, 0.1]) | ||
matrix = np.zeros((self.num_genes, self.num_samples), dtype=np.int32) | ||
for i in range(self.num_genes): | ||
mean_expression = expression_levels[i] | ||
size = (mean_expression**2) / (mean_expression * dispersion - mean_expression**2) if mean_expression * dispersion > mean_expression else 10 | ||
matrix[i, :] = np.random.negative_binomial(n=size, p=size / (size + mean_expression), size=self.num_samples).astype(int) | ||
self.apply_modifications(matrix, differential_expr_percentage, differential_factor, outlier_percentage, outlier_factor) | ||
df = pd.DataFrame(matrix, columns=self.sample_ids, index=self.gene_names) | ||
df = pd.DataFrame(matrix, columns=self.sample_ids, index=unique_gene_names) | ||
df.index.name = 'GeneID' | ||
return df | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would rename to
download_and_process_gff
, since it does more than just download now.