diff --git a/bin/filter_rare_transcripts.py b/bin/filter_rare_transcripts.py
index 06084f3..b73d6f0 100755
--- a/bin/filter_rare_transcripts.py
+++ b/bin/filter_rare_transcripts.py
@@ -156,11 +156,15 @@
     ['chromosome', 'strand', 'start', 'end']
 )
 
-mono['group'] = (
-    (mono['chromosome'] != mono['chromosome'].shift()) |
-    (mono['strand'] != mono['strand'].shift()) |
-    (mono['start'] > mono['end'].shift())
-).cumsum()
+previous_end_max = mono.set_index(['chromosome', 'strand'])['end'].groupby(
+    ['chromosome', 'strand'],
+    observed = True
+).shift().groupby(
+    ['chromosome', 'strand'],
+    observed = True
+).cummax().fillna(-1).to_numpy()
+
+mono['group'] = (mono['start'] > previous_end_max).cumsum()
 
 mono = mono.set_index(['chromosome', 'strand', 'group'])
 