-
Notifications
You must be signed in to change notification settings - Fork 27
/
parenthood_lib.py
693 lines (539 loc) · 22.2 KB
/
parenthood_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utilities for parsing EC and GO labels, and finding their parents."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import itertools
import os
import re
import typing
from typing import (Collection, Dict, FrozenSet, Iterable, List, Optional, Set,
Text, Tuple)
import pandas as pd
import utils
import tensorflow.compat.v1 as tf
import tqdm
# Directory holding the metadata files referenced by the constants below.
DATA_DIR = 'data/'
# From ftp://ftp.expasy.org/databases/enzyme/enzyme.dat
EC_LEAF_NODE_METADATA_PATH = os.path.join(DATA_DIR, 'enzyme.dat')
# From ftp://ftp.expasy.org/databases/enzyme/enzclass.txt
EC_NON_LEAF_NODE_METADATA_PATH = os.path.join(DATA_DIR, 'enzclass.txt')
# From http://purl.obolibrary.org/obo/go.obo
GO_METADATA_PATH = os.path.join(DATA_DIR, 'go.obo')
# Labels that are implied by other labels.
# Json is a map from string key (label) to list of applicable/implied
# labels (string).
APPLICABLE_LABEL_JSON_PATH = os.path.join(DATA_DIR, 'parenthood.json.gz')

# "GO:" followed by seven digits, followed by either a space or the end of
# the line. The first group is the term id, the second is the terminator.
GO_TERM_REGEX = re.compile(r'(GO:\d{7})( |$)')

# Allows numbers in all positions, or a hyphen in latter positions, or an 'n',
# indicating that the number is still undergoing consideration and review:
# https://www.ebi.ac.uk/ena/WebFeat/qualifiers/EC_number.html
# The separators are escaped so that only a literal '.' is accepted between
# the four components.
EC_NUMBER_REGEX = r'(\d+)\.([\d\-n]+)\.([\d\-n]+)\.([\d\-n]+)'
_TOP_LEVEL_EC_CLASS_VALUE = '-.-.-.-'

# Format of lines at ftp://ftp.expasy.org/databases/enzyme/enzclass.txt
# that contain an EC number.
_NON_LEAF_NODE_LINE_REGEX = re.compile(r'^\d\.')
# The determination that these terms are either parenthood or not was made
# with the help of this document:
# https://owlcollab.github.io/oboformat/doc/GO.format.obo-1_2.html

# Tags whose value is another id for the same term rather than a parent.
_IDENTITY_TYPE_GO_RELATIONS = {
    # Used for obsolete terms.
    'replaced_by',
    # Is basically a synonym.
    'alt_id',
}
# Tags whose value is a parent of the term.
_PARENTHOOD_TYPE_GO_RELATIONS = {
    # Clearly a transitive parenthood relation.
    'is_a',
}
# Tags that are recognized but are not kept as related labels of a term.
_NON_PARENTHOOD_TYPE_GO_RELATIONS = {
    # -------- Tag types that are clearly not parenthood relations -----------
    'comment',
    'created_by',
    'creation_date',
    'def',
    'disjoint_from',
    'id',
    'is_obsolete',
    'name',
    'namespace',
    'property_value',
    # -------- Tag types that seem like they'd be parenthood relations -------
    # -------- (but are not treated as such) ----------------------------------
    # Gives a term which may be an appropriate substitute for an obsolete
    # term, but needs to be looked at carefully by a human expert before the
    # replacement is done.
    'consider',
    # Describes subsets of go terms that may be useful for different uses.
    'subset',
    # Contains references to databases other than GO terms.
    'synonym',
    # As of August 10 2019, the following are the types of relationships
    # listed in the canonical GO ontology OBO file:
    # {'ends_during',
    #  'happens_during',
    #  'has_part',
    #  'negatively_regulates',
    #  'occurs_in',
    #  'part_of',
    #  'positively_regulates',
    #  'regulates'}
    # None of these has "is-a" type semantics, so none of them qualifies for
    # propagation to children.
    'relationship',
    # Contains references to databases other than GO terms.
    'xref',
    # "intersection_of": This tag indicates that this term is equivalent to
    # the intersection of several other terms. Many times, one of the terms
    # is already in an is_a relationship, and the other intersection term has
    # a non-parenthood type relation, e.g. has-a. For this reason, we exclude
    # intersection_of relationships.
    'intersection_of',
}
# Relation name given to the terms that are synthesized for alt_ids: it links
# the non-canonical alt id to the term it is an alternative id of.
_IS_NON_CANONICAL_ALT_ID_LABEL_OF = 'was_alt_id_of'
# Identifier of a GO term, e.g. GO:0000108.
TermID = Text
# Name of the relation between two terms, e.g. 'is_a'.
RelationDescriptor = Text
# GO is an ontology of relations of the form "term" "related to" "other term".
# A GoAttribute describes the right-hand side of such a relation: the relation
# type together with the id of the other term.
GoAttribute = Tuple[RelationDescriptor, TermID]
class GoTerm(
    typing.NamedTuple('GoTerm',
                      (('term_id', Text), ('term_name', Optional[Text]),
                       ('description', Optional[Text]),
                       ('related_labels', Set[GoAttribute])))):
  """A single term of the Gene Ontology.

  Terms are linked to other terms through an enumerated set of relation types,
  and only some of those relations are "is-a" relations. The on-disk format of
  a term is described in [1].

  [1] https://owlcollab.github.io/oboformat/doc/GO.format.obo-1_2.html

  Attributes:
    term_id: id of term. E.g. GO:0000108
    term_name: value of the "name:" tag in the obo file, if any.
    description: quoted text of the "def:" tag in the obo file, if any.
    related_labels: set of (relation, term_id) pairs that also apply to this
      term. These are the direct relations only; they are NOT transitive.
  """

  @classmethod
  def from_string(cls, s):
    """Builds a GoTerm from one [Term] block of an OBO file.

    Only identity-type and parenthood-type relations are kept as related
    labels; all other known tags are skipped.

    https://owlcollab.github.io/oboformat/doc/GO.format.obo-1_2.html

    Args:
      s: block of OBO file that starts with [Term]

    Returns:
      GoTerm

    Raises:
      ValueError: if the block does not contain exactly one "id" tag, or if
        it contains a tag that is not one of the known tag types.
    """
    parsed_lines = []
    for raw_line in s.split('\n'):
      if _is_go_attribute_line(raw_line):
        parsed_lines.append(_parse_go_attribute(raw_line))

    ids_found = [value for tag, value in parsed_lines if tag == 'id']
    if len(ids_found) != 1:
      raise ValueError(('Number of term names for term was {} '
                        '(expected exactly one). Term was {}.').format(
                            len(ids_found), parsed_lines))
    (the_term_id,) = ids_found

    name = None
    definition = None
    relations = set()
    for tag, value in parsed_lines:
      if tag == 'name':
        name = value
      if tag == 'def':
        # Keep only the quoted text; what follows the closing quote is
        # dropped.
        definition = re.findall('"(.*)" .*', value)[0]

      is_kept_relation = (
          tag in _PARENTHOOD_TYPE_GO_RELATIONS or
          tag in _IDENTITY_TYPE_GO_RELATIONS)
      if is_kept_relation:
        relations.add((tag, _get_go_term_from_text(value)))
        continue
      if tag not in _NON_PARENTHOOD_TYPE_GO_RELATIONS:
        valid_relations = _PARENTHOOD_TYPE_GO_RELATIONS.union(
            _NON_PARENTHOOD_TYPE_GO_RELATIONS).union(
                _IDENTITY_TYPE_GO_RELATIONS)
        raise ValueError('Term type unknown: was {} and expected one of {}. '
                         'Full value was {}'.format(tag, valid_relations, s))

    return GoTerm(
        term_id=the_term_id,
        term_name=name,
        description=definition,
        related_labels=relations)
def _is_go_attribute_line(s):
  """Returns whether `s` contains the ': ' tag/value separator."""
  tag_value_separator = ': '
  return tag_value_separator in s
def _parse_go_attribute(s: Text) -> Tuple[Text, Text]:
  """Splits a "tag: value" line into its tag and its value.

  Only the first ': ' is treated as the separator, so a value that itself
  contains ': ' (common in definitions) is returned unchanged.

  Args:
    s: a line of an OBO file, e.g. 'id: GO:0000001'.

  Returns:
    Tuple of (tag, value). If `s` contains no ': ', the value is ''.
  """
  tag, _, value = s.partition(': ')
  return (tag, value)
def _get_go_term_from_text(s):
  """Extracts the single GO term id contained in `s`.

  Args:
    s: text expected to contain exactly one GO term id.

  Returns:
    The GO term id, e.g. 'GO:0000001'.

  Raises:
    ValueError: if `s` does not contain exactly one GO term id.
  """
  found = GO_TERM_REGEX.findall(s)
  if len(found) == 1:
    # Each match is a (term id, terminator) pair, where the terminator is the
    # space or end-of-line that followed the id. Only the id is wanted.
    term_id, _ = found[0]
    return term_id
  raise ValueError(
      'Expected exactly one match for a GO term in string {}. Found matches {}'
      .format(s, found))
def _yield_terms_for_alt_ids(term):
  """Expands `term` into one term per alt_id, followed by the term itself.

  Alt ids have no term of their own in the ontology. A term is synthesized
  for each of them here so that alternative ids can be canonicalized to the
  preferred id.

  Args:
    term: GoTerm. May or may not have alt_ids among its related labels.

  Yields:
    A GoTerm for each alt_id of `term`, whose only related label points at
    `term`; then `term` itself with its alt_id labels removed.
  """
  for relation, related_term_id in term.related_labels:
    if relation != 'alt_id':
      continue
    yield GoTerm(
        term_id=related_term_id,
        term_name=term.term_name,
        description=term.description,
        related_labels={(_IS_NON_CANONICAL_ALT_ID_LABEL_OF, term.term_id)})

  labels_without_alt_ids = set()
  for label in term.related_labels:
    if label[0] != 'alt_id':
      labels_without_alt_ids.add(label)
  yield GoTerm(
      term_id=term.term_id,
      term_name=term.term_name,
      description=term.description,
      related_labels=labels_without_alt_ids)
def parse_full_go_file(file_contents: Optional[Text] = None):
  """Parses contents of OBO file containing the GO ontology.

  Args:
    file_contents: string. File contents of go file. If None, the file at
      GO_METADATA_PATH is read.

  Returns:
    List of GoTerm. Besides one term per [Term] block, the list contains one
    synthesized term for every alt_id found in those blocks.
  """
  if file_contents is None:
    # GFile is the file class exported by tf.io.gfile.
    with tf.io.gfile.GFile(GO_METADATA_PATH) as f:
      file_contents = f.read()

  # Stanzas are separated by blank lines; only [Term] stanzas are of interest.
  unparsed_terms = [
      stanza for stanza in file_contents.split('\n\n')
      if stanza.startswith('[Term]')
  ]
  parsed_terms = [GoTerm.from_string(stanza) for stanza in unparsed_terms]
  return list(
      itertools.chain.from_iterable(
          _yield_terms_for_alt_ids(term) for term in parsed_terms))
def go_label_to_description(go_file_contents=None):
  """Builds a map from GO term id to the name of that term.

  Args:
    go_file_contents: file contents of the GO OBO file, passed through to
      parse_full_go_file.

  Returns:
    Dict from term id to the term's name formatted as a string (so a term
    without a name maps to 'None').
  """
  descriptions = {}
  for go_term in parse_full_go_file(go_file_contents):
    descriptions[go_term.term_id] = f'{go_term.term_name}'
  return descriptions
def _go_term_applicable_labels_should_include_themselves(term):
  """Decides whether `term` is canonical, i.e. should imply itself.

  A term that is an alt_id of another term, or that is obsolete (carries a
  replaced_by relation), is not an applicable label for itself.

  Args:
    term: GoTerm.

  Returns:
    bool
  """
  disqualifying_relations = ('replaced_by', _IS_NON_CANONICAL_ALT_ID_LABEL_OF)
  for relation_type, _ in term.related_labels:
    if relation_type in disqualifying_relations:
      return False
  return True
def transitive_go_parenthood(go_terms):
  """Expands the direct relations of each GoTerm into transitive parents.

  Every term is included in its own parent set, except for alt_ids and terms
  that carry a replaced_by tag.

  When a node is an alt id, its only direct parent is the term for which it
  is an alt_id. Note that a term may only be an alt_id for one term [1].

  When a node is obsolete, it has one or more replaced_by tags [2], and this
  obsolete name will not be included in the transitive right-hand-side.

  [1]
  https://github.com/geneontology/go-ontology/blob/7be0df46781f2e3a456a3e178def19dcdbb20ecf/src/util/check-obo-for-standard-release.pl#L134-L140
  [2] https://owlcollab.github.io/oboformat/doc/GO.format.obo-1_4.html

  Args:
    go_terms: List of GoTerm.

  Returns:
    Dict of term_id -> transitive set of parent term ids.
  """
  # Direct (one-hop) related ids of every term, regardless of relation type.
  direct_parents = {}
  for term in go_terms:
    direct_parents[term.term_id] = frozenset(
        label for _, label in term.related_labels)

  closure = {}
  for term in tqdm.tqdm(go_terms, position=0):
    closure[term.term_id] = _transitive_parenthood(term.term_id,
                                                   direct_parents)

  # Canonical terms (neither alt_id nor obsolete) also imply themselves.
  for term in go_terms:
    if _go_term_applicable_labels_should_include_themselves(term):
      closure[term.term_id].add(term.term_id)
  return closure
def _transitive_parenthood(key, term_dict):
  """Collects every term reachable from `key` by following `term_dict`.

  `key` itself is not added because of being the starting point, whatever the
  type of relation; whether a term implies itself is left to the caller.

  Args:
    key: Go Term, e.g. GO:0000001
    term_dict: Go term to set of parent go terms.

  Returns:
    Set of transitive parent go terms of `key`.
  """
  visited = set()
  pending = list(term_dict[key])  # Work list of terms still to be expanded.
  while pending:
    current = pending.pop()
    if current in visited:
      continue
    visited.add(current)
    pending.extend(
        parent for parent in term_dict[current] if parent not in visited)
  return visited
def _replace_one_level_up_with_dash_for_ec(s: Text) -> Text:
  """Finds direct parent of a label.

  Args:
    s: e.g. 1.2.3.4. Values including 'n' in one of their numbers are treated
      like every other value. Non leaf nodes (those including a hyphen) are
      also allowed.

  Returns:
    The label with its most specific level replaced by a hyphen, e.g. 1.2.3.-
    for 1.2.3.4, and 1.2.-.- for 1.2.3.-.

  Raises:
    ValueError: if `s` contains more than three hyphens, or if it contains no
      text matching EC_NUMBER_REGEX. The latter would otherwise be returned
      unchanged, which makes callers that walk up the hierarchy loop forever.
  """
  replacement_by_hyphen_count = {
      0: r'\1.\2.\3.-',
      1: r'\1.\2.-.-',
      2: r'\1.-.-.-',
      3: '-.-.-.-',
  }
  replacement = replacement_by_hyphen_count.get(s.count('-'))
  if replacement is None:
    raise ValueError('Expected the number of hyphens in string to be between '
                     '0 and 3 (string was {}). Check that the input matches the '
                     'regex {}'.format(s, EC_NUMBER_REGEX))

  parent, num_substitutions = re.subn(EC_NUMBER_REGEX, replacement, s)
  if num_substitutions == 0:
    raise ValueError('Expected string {} to contain an EC number matching the '
                     'regex {}'.format(s, EC_NUMBER_REGEX))
  return parent
def _all_ec_parents_for_label(label):
  """Computes the label itself plus all of its ancestors in the EC hierarchy.

  The top level EC (level 0) value -.-.-.- is never part of the output.

  Args:
    label: e.g. 1.2.3.4. Values including 'n' in one of their numbers are
      treated like every other value. Non leaf nodes (those including a
      hyphen) are also allowed.

  Returns:
    For e.g., 1.2.3.-, returns set(1.2.3.-, 1.2.-.-, 1.-.-.-).
    That is, the input `label` is included, and the root node -.-.-.- is not.
  """
  lineage = set()
  current = label
  while True:
    if current == _TOP_LEVEL_EC_CLASS_VALUE:
      break  # The root is excluded from the output.
    lineage.add(current)  # On the first pass this adds `label` itself.
    current = _replace_one_level_up_with_dash_for_ec(current)
  return lineage
def _get_leaf_node_ec_labels_from_file_contents(
    enzyme_dat_file_contents: Optional[Text] = None
) -> List[Tuple[Text, Text]]:
  """Parses enzyme.dat file [1] into EC numbers and descriptions.

  [1] ftp://ftp.expasy.org/databases/enzyme/enzyme.dat
  [2] ftp://ftp.expasy.org/databases/enzyme/enzuser.txt

  Args:
    enzyme_dat_file_contents: Text of file at [1]. Follows format at [2].
      Contains only information about leaf nodes of the EC hierarchy (labels
      with no hyphens; e.g. 1.2.3.4). If None, the current file is parsed from
      disk.

  Returns:
    List of tuples like ("1.2.3.4", "oxalic acid oxidase"). When an entry has
    several DE lines, their texts are concatenated with no separator.
    Note: ec numbers do not include the string "EC".
  """
  if enzyme_dat_file_contents is None:
    # GFile is the file class exported by tf.io.gfile.
    with tf.io.gfile.GFile(EC_LEAF_NODE_METADATA_PATH) as f:
      enzyme_dat_file_contents = f.read()

  ids_and_descriptions = []
  # Beginning of EC file does not have to do with term parsing; we omit
  # the "0th" ID entry. See [1] in docstring for the format.
  id_blocks = enzyme_dat_file_contents.split('\nID')[1:]
  for id_block in id_blocks:
    lines_in_block = id_block.split('\n')
    # The leading "ID" was consumed by the split above, so the first line is
    # just whitespace followed by the EC number.
    term_id = re.findall(r'\s+(.*)', lines_in_block[0])[0]
    desc = ''.join(
        re.findall(r'DE\s+(.*)', line)[0]
        for line in lines_in_block
        if line.startswith('DE'))
    ids_and_descriptions.append((term_id, desc))
  return ids_and_descriptions
def _get_non_leaf_node_ec_labels_from_file_contents(
    enzyme_class_file_contents: Optional[Text] = None
) -> List[Tuple[Text, Text]]:
  """Parses enzclass.txt file [1] into EC numbers and descriptions.

  [1] ftp://ftp.expasy.org/databases/enzyme/enzclass.txt

  Args:
    enzyme_class_file_contents: Text of file at [1]. Contains only information
      about non-leaf nodes of the EC hierarchy (e.g. 1.2.3.-). If None, the
      current file is parsed from disk.

  Returns:
    List of tuples like ("1.-.-.-", "oxidoreductase").
    Note: ec numbers do not include the string "EC".
  """
  if enzyme_class_file_contents is None:
    # GFile is the file class exported by tf.io.gfile.
    with tf.io.gfile.GFile(EC_NON_LEAF_NODE_METADATA_PATH) as f:
      enzyme_class_file_contents = f.read()

  terms_and_descriptions = []
  for raw_line in enzyme_class_file_contents.split('\n'):
    if not _NON_LEAF_NODE_LINE_REGEX.match(raw_line):
      continue
    line = raw_line.strip()
    # The EC number occupies the first nine characters and is padded with
    # spaces (e.g. "1. 1. -.-"); removing the padding gives "1.1.-.-".
    term_id = line[:9].replace(' ', '')
    term_description = re.findall(r'.*.-\s+(.*)', line)[0]
    terms_and_descriptions.append((term_id, term_description))
  return terms_and_descriptions
def ec_label_to_description(enzyme_dat_file_contents=None,
                            enzyme_class_file_contents=None):
  """Builds a dictionary from EC label to its description.

  [1] ftp://ftp.expasy.org/databases/enzyme/enzyme.dat
  [2] ftp://ftp.expasy.org/databases/enzyme/enzuser.txt
  [3] ftp://ftp.expasy.org/databases/enzyme/enzclass.txt

  Args:
    enzyme_dat_file_contents: Text of file at [1]. Follows format at [2].
      Contains only information about leaf nodes of the EC hierarchy (labels
      with no hyphens; e.g. 1.2.3.4).
    enzyme_class_file_contents: Text of file at [3]. Contains only information
      about non-leaf nodes of the EC hierarchy (e.g. 1.2.3.-).

  Returns:
    Dictionary from EC label (prefixed with "EC:") to description. Non
    root-level terms DO NOT have their parents information included, for
    easier human-readability.
  """
  leaf_entries = _get_leaf_node_ec_labels_from_file_contents(
      enzyme_dat_file_contents)
  non_leaf_entries = _get_non_leaf_node_ec_labels_from_file_contents(
      enzyme_class_file_contents)
  # Non-leaf entries go first, so a leaf entry wins if a label is repeated.
  return {
      'EC:' + term: description
      for term, description in itertools.chain(non_leaf_entries, leaf_entries)
  }
def parse_full_ec_file_to_transitive_parenthood(
    enzyme_dat_file_contents: Optional[Text],
    enzyme_class_file_contents: Optional[Text],
) -> Dict[Text, Set[Text]]:
  """Parses enzyme.dat [1] and enzclass.txt [3] into transitive parenthood dict.

  [1] ftp://ftp.expasy.org/databases/enzyme/enzyme.dat
  [2] ftp://ftp.expasy.org/databases/enzyme/enzuser.txt
  [3] ftp://ftp.expasy.org/databases/enzyme/enzclass.txt

  Args:
    enzyme_dat_file_contents: Text of file at [1]. Follows format at [2].
      Contains only information about leaf nodes of the EC hierarchy (labels
      with no hyphens; e.g. 1.2.3.4).
    enzyme_class_file_contents: Text of file at [3]. Contains only information
      about non-leaf nodes of the EC hierarchy (e.g. 1.2.3.-).

  Returns:
    Dict of all EC numbers to each of their parents. The values themselves
    are included in their parent set. Note that [1] only includes leaf nodes;
    this function includes all members of the tree as keys in the return value.
    The root node -.-.-.- appears neither as a key nor in any parent set.
    Keys and parents all carry the prefix "EC:".
  """
  leaf_node_labels = _get_leaf_node_ec_labels_from_file_contents(
      enzyme_dat_file_contents)
  non_leaf_node_labels = _get_non_leaf_node_ec_labels_from_file_contents(
      enzyme_class_file_contents)

  id_to_transitive_parents = {}
  for label, _ in tqdm.tqdm(leaf_node_labels + non_leaf_node_labels):
    # Also add parents themselves as labels in the dictionary.
    for parent_of_label in _all_ec_parents_for_label(label):
      prefixed_parent = 'EC:' + parent_of_label
      # Keys are stored with the "EC:" prefix, so the lookup must use it too;
      # shared ancestors are then expanded only once.
      if prefixed_parent in id_to_transitive_parents:
        continue
      id_to_transitive_parents[prefixed_parent] = {
          'EC:' + x for x in _all_ec_parents_for_label(parent_of_label)
      }
  return id_to_transitive_parents
def get_applicable_label_dict(path=APPLICABLE_LABEL_JSON_PATH):
  """Loads the applicable-label data stored at `path`.

  Args:
    path: location of the file, read with utils.load_gz_json. Defaults to
      APPLICABLE_LABEL_JSON_PATH.

  Returns:
    Whatever utils.load_gz_json returns for `path`; presumably a map from
    label to the list of labels it implies.
  """
  applicable_label_dict = utils.load_gz_json(path)
  return applicable_label_dict
def reverse_map(applicable_label_dict, label_vocab=None):
  """Inverts a parenthood dict so that it maps parents to their children.

  Args:
    applicable_label_dict: e.g. output of get_applicable_label_dict.
    label_vocab: e.g. output of inference_lib.vocab_from_model_base_path. When
      given, children that are not in the vocab are left out.

  Returns:
    collections.defaultdict of k, v where:
    k: originally the values in applicable_label_dict
    v: frozenset of what were originally the keys in applicable_label_dict.
    The defaultdict returns an empty frozenset for keys that are not found.
    This behavior is desirable for lifted clan label normalizers, where
    keys may not imply themselves.
  """
  # The input is technically the entire transitive closure, so a plain
  # inversion is safe for DAGs (e.g. GO labels).
  grouped = {}
  for child, parents in applicable_label_dict.items():
    # Avoid adding children which don't appear in the vocab.
    if label_vocab is not None and child not in label_vocab:
      continue
    for parent in parents:
      grouped.setdefault(parent, set()).add(child)

  reversed_map = collections.defaultdict(frozenset)
  for parent, children_of_parent in grouped.items():
    reversed_map[parent] = frozenset(children_of_parent)
  return reversed_map
def is_implied_by_something_else(
    current_label,
    reversed_normalizer,
    all_labels_for_protein,
):
  """Tells whether another label of the protein already implies this one.

  Args:
    current_label: label about which we're asking "is this implied by some
      other label for this protein?"
    reversed_normalizer: output of reverse_map(label_normalizer). Helps this
      function run fast.
    all_labels_for_protein: set of all labels given to protein.

  Returns:
    bool
  """
  only_current = frozenset([current_label])
  other_labels = all_labels_for_protein - only_current
  # Most labels imply themselves; a label must not count as its own child.
  strict_children = reversed_normalizer[current_label] - only_current
  return not strict_children.isdisjoint(other_labels)
def _filter_label_set_to_most_specific(label_set, reversed_normalizer):
  """Drops every label that is implied by another label in the same set.

  Args:
    label_set: set of all labels given to protein.
    reversed_normalizer: output of reverse_map(label_normalizer). Helps this
      function run fast.

  Returns:
    Frozenset of the labels that no other label in `label_set` implies.
  """
  most_specific = set()
  for label in label_set:
    if is_implied_by_something_else(label, reversed_normalizer, label_set):
      continue
    most_specific.add(label)
  return frozenset(most_specific)
def filter_labels_to_most_specific(
    df,
    normalizer,
    column_to_filter='predicted_label',
):
  """Reduces the labels of each protein to the most specific ones.

  Useful for labels like GO, where we predict a ton of labels, and we only
  want to look at the most informative labels.

  Args:
    df: pd.DataFrame with column `column_to_filter`.
    normalizer: label normalizer.
    column_to_filter: name of column in df.

  Returns:
    Copy of `df` in which `column_to_filter` holds the filtered label sets.
    The input DataFrame is not modified.
  """
  reversed_normalizer = reverse_map(normalizer)

  def _keep_most_specific(label_set):
    return _filter_label_set_to_most_specific(label_set, reversed_normalizer)

  filtered_df = df.copy()
  filtered_df[column_to_filter] = filtered_df[column_to_filter].apply(
      _keep_most_specific)
  return filtered_df