From c3c9cab075325dacb7666f433adaae7780982fb1 Mon Sep 17 00:00:00 2001
From: memgonzales <gonzales.markedward@gmail.com>
Date: Tue, 12 Sep 2023 15:11:22 +0800
Subject: [PATCH] Optimize code for getting unique genes in other reference

---
 callbacks/lift_over/util.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/callbacks/lift_over/util.py b/callbacks/lift_over/util.py
index ddb27b03..fab50e12 100644
--- a/callbacks/lift_over/util.py
+++ b/callbacks/lift_over/util.py
@@ -341,14 +341,11 @@ def get_interpro_entries(genes, interpro_mapping, iric_mapping):
     return [get_interpro_entry(gene, interpro_mapping, iric_mapping) for gene in genes]
 
 
-def get_nb_ortholog(gene, ref):
-    with open(f'{Constants.NB_MAPPING}/{ref}_to_Nb.pickle', 'rb') as f:
-        mapping = pickle.load(f)
-
-        if mapping[gene]:
-            return ', '.join(mapping[gene])
+def get_nb_ortholog(gene, ref, nb_ortholog_mapping):
+    if nb_ortholog_mapping[gene]:
+        return ', '.join(nb_ortholog_mapping[gene])
 
-        return NULL_PLACEHOLDER
+    return NULL_PLACEHOLDER
 
 
 # ========================
@@ -601,8 +598,11 @@ def get_unique_genes_in_other_ref(ref, nb_intervals):
     genes_in_nb = genes_in_nb[['OGI']]
 
     # Get set difference
-    unique_genes = pd.concat([genes_in_other_ref, genes_in_nb, genes_in_nb]).drop_duplicates(
-        subset=['OGI'], keep=False)
+    genes_in_nb_set = set(map(tuple, genes_in_nb.values))
+    genes_in_other_ref_set = set(map(tuple, genes_in_other_ref.values))
+
+    unique_genes = pd.DataFrame(
+        list(genes_in_other_ref_set.difference(genes_in_nb_set)))
 
     gene_description_df = pd.read_csv(
         f'{Constants.GENE_DESCRIPTIONS}/{ref}/{ref}_gene_descriptions.csv')
@@ -613,8 +613,11 @@ def get_unique_genes_in_other_ref(ref, nb_intervals):
     unique_genes = gene_description_df.join(unique_genes, how='right')
     unique_genes = unique_genes.reset_index()
 
-    unique_genes['Ortholog in Nipponbare'] = unique_genes.apply(
-        lambda x: get_nb_ortholog(x['Name'], ref), axis=1)
+    with open(f'{Constants.NB_MAPPING}/{ref}_to_Nb.pickle', 'rb') as f:
+        nb_ortholog_mapping = pickle.load(f)
+
+        unique_genes['Ortholog in Nipponbare'] = unique_genes.apply(
+            lambda x: get_nb_ortholog(x['Name'], ref, nb_ortholog_mapping), axis=1)
 
     unique_genes = unique_genes[FRONT_FACING_COLUMNS +
                                 ['Ortholog in Nipponbare']]