From c3c9cab075325dacb7666f433adaae7780982fb1 Mon Sep 17 00:00:00 2001 From: memgonzales Date: Tue, 12 Sep 2023 15:11:22 +0800 Subject: [PATCH] Optimize code for getting unique genes in other reference --- callbacks/lift_over/util.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/callbacks/lift_over/util.py b/callbacks/lift_over/util.py index ddb27b03..fab50e12 100644 --- a/callbacks/lift_over/util.py +++ b/callbacks/lift_over/util.py @@ -341,14 +341,11 @@ def get_interpro_entries(genes, interpro_mapping, iric_mapping): return [get_interpro_entry(gene, interpro_mapping, iric_mapping) for gene in genes] -def get_nb_ortholog(gene, ref): - with open(f'{Constants.NB_MAPPING}/{ref}_to_Nb.pickle', 'rb') as f: - mapping = pickle.load(f) - - if mapping[gene]: - return ', '.join(mapping[gene]) +def get_nb_ortholog(gene, ref, nb_ortholog_mapping): + if nb_ortholog_mapping[gene]: + return ', '.join(nb_ortholog_mapping[gene]) - return NULL_PLACEHOLDER + return NULL_PLACEHOLDER # ======================== @@ -601,8 +598,11 @@ def get_unique_genes_in_other_ref(ref, nb_intervals): genes_in_nb = genes_in_nb[['OGI']] # Get set difference - unique_genes = pd.concat([genes_in_other_ref, genes_in_nb, genes_in_nb]).drop_duplicates( - subset=['OGI'], keep=False) + genes_in_nb_set = set(map(tuple, genes_in_nb.values)) + genes_in_other_ref_set = set(map(tuple, genes_in_other_ref.values)) + + unique_genes = pd.DataFrame( + list(genes_in_other_ref_set.difference(genes_in_nb_set))) gene_description_df = pd.read_csv( f'{Constants.GENE_DESCRIPTIONS}/{ref}/{ref}_gene_descriptions.csv') @@ -613,8 +613,11 @@ def get_unique_genes_in_other_ref(ref, nb_intervals): unique_genes = gene_description_df.join(unique_genes, how='right') unique_genes = unique_genes.reset_index() - unique_genes['Ortholog in Nipponbare'] = unique_genes.apply( - lambda x: get_nb_ortholog(x['Name'], ref), axis=1) + with open(f'{Constants.NB_MAPPING}/{ref}_to_Nb.pickle', 'rb') as f: + nb_ortholog_mapping = pickle.load(f) + + unique_genes['Ortholog in Nipponbare'] = unique_genes.apply( + lambda x: get_nb_ortholog(x['Name'], ref, nb_ortholog_mapping), axis=1) unique_genes = unique_genes[FRONT_FACING_COLUMNS + ['Ortholog in Nipponbare']]