From b93c7d44804ad0a92f30f62798ba63c9a23086cf Mon Sep 17 00:00:00 2001 From: chbk Date: Thu, 22 Sep 2022 18:11:26 +0200 Subject: [PATCH] Fix monoexonic transcripts filtering --- bin/filter_rare_transcripts.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/filter_rare_transcripts.py b/bin/filter_rare_transcripts.py index 06084f3..b73d6f0 100755 --- a/bin/filter_rare_transcripts.py +++ b/bin/filter_rare_transcripts.py @@ -156,11 +156,15 @@ ['chromosome', 'strand', 'start', 'end'] ) -mono['group'] = ( - (mono['chromosome'] != mono['chromosome'].shift()) | - (mono['strand'] != mono['strand'].shift()) | - (mono['start'] > mono['end'].shift()) -).cumsum() +previous_end_max = mono.set_index(['chromosome', 'strand'])['end'].groupby( + ['chromosome', 'strand'], + observed = True +).shift().groupby( + ['chromosome', 'strand'], + observed = True +).cummax().fillna(-1).to_numpy() + +mono['group'] = (mono['start'] > previous_end_max).cumsum() mono = mono.set_index(['chromosome', 'strand', 'group'])