diff --git a/CHANGES.rst b/CHANGES.rst
index d3d041e5..bf338529 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -3,6 +3,14 @@ Changelog
 =========
 
+development version
+-------------------
+* Add a ``--max-average-error-rate``/``--max-aer`` option for filtering out
+  reads whose number of expected errors divided by the read length exceeds
+  a given threshold. The expected errors are calculated as for
+  ``--max-expected-errors``; dividing by the read length makes the filter
+  suitable for reads of varying lengths.
+
 v4.4 (2023-04-28)
 -----------------
diff --git a/doc/algorithms.rst b/doc/algorithms.rst
index 604c5ee8..17d3c81f 100644
--- a/doc/algorithms.rst
+++ b/doc/algorithms.rst
@@ -207,3 +207,9 @@
 computed from the quality scores as described in the USEARCH paper by
 The USEARCH manual page `has a lot more background on expected errors
 `_ and how to choose a threshold.
+
+The ``--max-average-error-rate`` (short version: ``--max-aer``) option works
+similarly, but divides the expected errors by the length of the read.
+The resulting fraction is then used to filter the read. This is especially
+helpful for reads of highly varying length, such as those coming
+from nanopore sequencing.
diff --git a/doc/guide.rst b/doc/guide.rst
index ca08fd5e..528fc1bb 100644
--- a/doc/guide.rst
+++ b/doc/guide.rst
@@ -1416,6 +1416,12 @@ reads. They always discard those reads for which the filtering criterion applies
 
 ``--max-expected-errors ERRORS`` or ``--max-ee ERRORS``
     Discard reads with more than ERRORS :ref:`expected errors `.
+
+``--max-average-error-rate ERROR_RATE`` or ``--max-aer ERROR_RATE``
+    Discard reads with more than ERROR_RATE average expected errors
+    (total expected errors divided by the read length). ERROR_RATE must
+    be between 0.0 and 1.0.
+
 ``--discard-casava``
     Discard reads that did not pass CASAVA filtering. Illumina’s CASAVA pipeline in version 1.8
     adds an *is_filtered* header field to each read. Specifying this
diff --git a/src/cutadapt/cli.py b/src/cutadapt/cli.py
index ca2e9e9c..969ab631 100644
--- a/src/cutadapt/cli.py
+++ b/src/cutadapt/cli.py
@@ -296,6 +296,10 @@ def get_argument_parser() -> ArgumentParser:
         metavar="ERRORS",
         help="Discard reads whose expected number of errors (computed "
         "from quality values) exceeds ERRORS.")
+    group.add_argument("--max-average-error-rate", "--max-aer", type=float, default=None,
+        metavar="ERROR_RATE",
+        help="as --max-expected-errors (see above), but divided by read length to "
+        "account for reads of varying length.")
     group.add_argument("--discard-trimmed", "--discard", action='store_true', default=False,
         help="Discard reads that contain an adapter. Use also -O to avoid "
         "discarding too many randomly matching reads.")
@@ -904,6 +908,7 @@ def make(self) -> Pipeline:
             setattr(pipeline, attr, lengths)
         pipeline.max_n = self.args.max_n
         pipeline.max_expected_errors = self.args.max_expected_errors
+        pipeline.max_average_error_rate = self.args.max_average_error_rate
         pipeline.discard_casava = self.args.discard_casava
         pipeline.discard_trimmed = self.args.discard_trimmed
         pipeline.discard_untrimmed = self.args.discard_untrimmed
diff --git a/src/cutadapt/pipeline.py b/src/cutadapt/pipeline.py
index b842c234..5e0bdb27 100644
--- a/src/cutadapt/pipeline.py
+++ b/src/cutadapt/pipeline.py
@@ -30,6 +30,7 @@
     TooLong,
     TooManyN,
     TooManyExpectedErrors,
+    TooHighAverageErrorRate,
     CasavaFiltered,
     DiscardTrimmed,
 )
@@ -72,6 +73,7 @@ def __init__(self) -> None:
         self._maximum_length = None
         self.max_n = None
         self.max_expected_errors = None
+        self.max_average_error_rate = None
         self.discard_casava = False
         self.discard_trimmed = False
         self.discard_untrimmed = False
@@ -149,6 +151,15 @@ def _set_output(self, outfiles: OutputFiles) -> None:  # noqa: C901
                 f1 = f2 = TooManyExpectedErrors(self.max_expected_errors)
                 self._steps.append(self._make_filter(f1, f2, None))
 
+        if self.max_average_error_rate is not None:
+            if not self._reader.delivers_qualities:
+                logger.warning(
+                    "Ignoring option --max-average-error-rate because input does not contain quality values"
+                )
+            else:
+                f1 = f2 = TooHighAverageErrorRate(self.max_average_error_rate)
+                self._steps.append(self._make_filter(f1, f2, None))
+
         if self.discard_casava:
             f1 = f2 = CasavaFiltered()
             self._steps.append(self._make_filter(f1, f2, None))
diff --git a/src/cutadapt/predicates.py b/src/cutadapt/predicates.py
index 4346efef..cd1f1de2 100644
--- a/src/cutadapt/predicates.py
+++ b/src/cutadapt/predicates.py
@@ -71,6 +71,27 @@ def test(self, read, info: ModificationInfo):
         return expected_errors(read.qualities) > self.max_errors
 
 
+class TooHighAverageErrorRate(Predicate):
+    """
+    Select reads that have an average error rate above the threshold.
+    This works better than TooManyExpectedErrors for reads that are expected to
+    have varying lengths, such as those from long-read sequencing technologies.
+    """
+
+    def __init__(self, max_error_rate: float):
+        if not 0.0 < max_error_rate < 1.0:
+            raise ValueError(
+                f"max_error_rate must be between 0.0 and 1.0, got {max_error_rate}."
+            )
+        self.max_error_rate = max_error_rate
+
+    def __repr__(self):
+        return f"TooHighAverageErrorRate(max_error_rate={self.max_error_rate})"
+
+    def test(self, read, info: ModificationInfo):
+        return (expected_errors(read.qualities) / len(read)) > self.max_error_rate
+
+
 class TooManyN(Predicate):
     """
     Select reads that have too many 'N' bases.
diff --git a/tests/test_predicates.py b/tests/test_predicates.py
index a8aef42d..4e94b81b 100644
--- a/tests/test_predicates.py
+++ b/tests/test_predicates.py
@@ -4,7 +4,7 @@
 import pytest
 from dnaio import SequenceRecord
 
-from cutadapt.predicates import TooManyN
+from cutadapt.predicates import TooManyN, TooHighAverageErrorRate
 from cutadapt.steps import PairedEndFilter
 
 
@@ -21,7 +21,6 @@
     ],
 )
 def test_too_many_n(seq, count, expected):
-    # third parameter is True if read should be Trueed
     predicate = TooManyN(count=count)
     _seq = SequenceRecord("read1", seq, qualities="#" * len(seq))
     assert predicate.test(_seq, []) == expected
@@ -53,3 +52,20 @@ def test_invalid_pair_filter_mode():
     with pytest.raises(ValueError) as e:
         PairedEndFilter(None, None, None, "invalidmode")
     assert "pair_filter_mode must be" in e.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "quals,rate,expected",
+    [
+        # 3 * 0.1 is larger than 0.3 due to floating point rounding.
+        (chr(43) * 3, 0.1, True),
+        (chr(43) * 3 + chr(33), 0.1, True),  # 3 * 0.1 + 1
+        (chr(43) * 3 + chr(33), 0.33, False),  # 3 * 0.1 + 1
+        (chr(43) * 3 + chr(33), 0.32, True),  # 3 * 0.1 + 1
+        (chr(126) * 9 + chr(33), 0.1, True),
+    ],
+)
+def test_too_high_average_error_rate(quals, rate, expected):
+    predicate = TooHighAverageErrorRate(rate)
+    _seq = SequenceRecord("read1", "A" * len(quals), qualities=quals)
+    assert predicate.test(_seq, []) == expected
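
For reference, a minimal standalone sketch of the decision the new TooHighAverageErrorRate predicate makes. The expected_errors() helper below is a plain reimplementation for illustration, assuming Phred+33-encoded quality strings; it is not the function that cutadapt itself imports, and the name exceeds_average_error_rate is illustrative only.

# Illustrative sketch; names below are not part of cutadapt's API.
def expected_errors(qualities: str) -> float:
    """Sum of per-base error probabilities from a Phred+33 quality string."""
    return sum(10 ** (-(ord(ch) - 33) / 10) for ch in qualities)


def exceeds_average_error_rate(qualities: str, max_error_rate: float) -> bool:
    """True if a read with these qualities would be discarded by --max-average-error-rate."""
    return expected_errors(qualities) / len(qualities) > max_error_rate


if __name__ == "__main__":
    # Mirrors the test cases above: chr(43) == '+' is Phred 10 (error
    # probability 0.1), chr(33) == '!' is Phred 0 (error probability 1.0).
    print(exceeds_average_error_rate(chr(43) * 3, 0.1))             # True (floating-point rounding)
    print(exceeds_average_error_rate(chr(43) * 3 + chr(33), 0.33))  # False: 1.3 / 4 = 0.325
    print(exceeds_average_error_rate(chr(43) * 3 + chr(33), 0.32))  # True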