Skip to content

Commit

Permalink
Merge pull request #725 from rhpvorderman/av-ee
Browse files Browse the repository at this point in the history
Add an option for maximum average error rate (--max-aer)
  • Loading branch information
marcelm authored Sep 6, 2023
2 parents cefb3e0 + d7491a3 commit a5f1d69
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 2 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
Changelog
=========

development version
-------------------
* Add a ``--max-average-error-rate``/``--max-aer`` option to add a filter
  that checks whether the number of expected errors divided by the read length
  is above a certain threshold. The expected errors are calculated in the same
  way as for ``--max-expected-errors``; dividing by the read length helps for
  reads that have varying lengths.

v4.4 (2023-04-28)
-----------------

Expand Down
6 changes: 6 additions & 0 deletions doc/algorithms.rst
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,9 @@ computed from the quality scores as described in the USEARCH paper by
The USEARCH manual page `has a lot more background on expected
errors <https://www.drive5.com/usearch/manual/exp_errs.html>`_ and how to choose
a threshold.

The ``--max-average-error-rate`` (short version: ``--max-aer``) option works
similarly but divides the expected errors by the length of the read.
The resulting fraction is then used to filter the read. This is especially
helpful on reads that have highly varying read length, such as those coming
from nanopore sequencing.
6 changes: 6 additions & 0 deletions doc/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,12 @@ reads. They always discard those reads for which the filtering criterion applies
``--max-expected-errors ERRORS`` or ``--max-ee ERRORS``
Discard reads with more than ERRORS :ref:`expected errors <expected-errors>`.


``--max-average-error-rate ERROR_RATE`` or ``--max-aer ERROR_RATE``
Discard reads with more than ERROR_RATE average expected errors
(total expected errors divided by the read length). ERROR_RATE must
be between 0.0 and 1.0.

``--discard-casava``
Discard reads that did not pass CASAVA filtering. Illumina’s CASAVA pipeline in
version 1.8 adds an *is_filtered* header field to each read. Specifying this
Expand Down
5 changes: 5 additions & 0 deletions src/cutadapt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,10 @@ def get_argument_parser() -> ArgumentParser:
metavar="ERRORS",
help="Discard reads whose expected number of errors (computed "
"from quality values) exceeds ERRORS.")
group.add_argument("--max-average-error-rate", "--max-aer", type=float, default=None,
metavar="ERROR_RATE",
help="as --max-expected-errors (see above), but divided by length to "
"account for reads of varying length.")
group.add_argument("--discard-trimmed", "--discard", action='store_true', default=False,
help="Discard reads that contain an adapter. Use also -O to avoid "
"discarding too many randomly matching reads.")
Expand Down Expand Up @@ -904,6 +908,7 @@ def make(self) -> Pipeline:
setattr(pipeline, attr, lengths)
pipeline.max_n = self.args.max_n
pipeline.max_expected_errors = self.args.max_expected_errors
pipeline.max_average_error_rate = self.args.max_average_error_rate
pipeline.discard_casava = self.args.discard_casava
pipeline.discard_trimmed = self.args.discard_trimmed
pipeline.discard_untrimmed = self.args.discard_untrimmed
Expand Down
11 changes: 11 additions & 0 deletions src/cutadapt/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
TooLong,
TooManyN,
TooManyExpectedErrors,
TooHighAverageErrorRate,
CasavaFiltered,
DiscardTrimmed,
)
Expand Down Expand Up @@ -72,6 +73,7 @@ def __init__(self) -> None:
self._maximum_length = None
self.max_n = None
self.max_expected_errors = None
self.max_average_error_rate = None
self.discard_casava = False
self.discard_trimmed = False
self.discard_untrimmed = False
Expand Down Expand Up @@ -149,6 +151,15 @@ def _set_output(self, outfiles: OutputFiles) -> None: # noqa: C901
f1 = f2 = TooManyExpectedErrors(self.max_expected_errors)
self._steps.append(self._make_filter(f1, f2, None))

if self.max_average_error_rate is not None:
if not self._reader.delivers_qualities:
logger.warning(
"Ignoring option --max-er because input does not contain quality values"
)
else:
f1 = f2 = TooHighAverageErrorRate(self.max_average_error_rate)
self._steps.append(self._make_filter(f1, f2, None))

if self.discard_casava:
f1 = f2 = CasavaFiltered()
self._steps.append(self._make_filter(f1, f2, None))
Expand Down
21 changes: 21 additions & 0 deletions src/cutadapt/predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,27 @@ def test(self, read, info: ModificationInfo):
return expected_errors(read.qualities) > self.max_errors


class TooHighAverageErrorRate(Predicate):
    """
    Select reads whose average expected error rate is above the threshold.

    The average is the total number of expected errors (computed from the
    quality values) divided by the read length. This works better than
    TooManyExpectedErrors for reads that are expected to have varying
    lengths, such as those produced by long-read sequencing technologies.
    """

    def __init__(self, max_error_rate: float):
        """
        Args:
            max_error_rate: Threshold average error rate per base. Must be
                strictly between 0.0 and 1.0 (an error *rate* is a fraction
                of the read length, so values outside this open interval
                cannot be meaningful thresholds).

        Raises:
            ValueError: If max_error_rate is not in the open interval (0, 1).
        """
        if not 0.0 < max_error_rate < 1.0:
            raise ValueError(
                f"max_error_rate must be between 0.0 and 1.0, got {max_error_rate}."
            )
        self.max_error_rate = max_error_rate

    def __repr__(self):
        # Fixed: the closing parenthesis was missing from the repr string.
        return f"TooHighAverageErrorRate(max_error_rate={self.max_error_rate})"

    def test(self, read, info: ModificationInfo):
        # Average expected errors per base; True means "discard this read".
        # NOTE(review): assumes len(read) > 0 — a zero-length read would
        # raise ZeroDivisionError here; confirm reads are never empty at
        # this point in the pipeline.
        return (expected_errors(read.qualities) / len(read)) > self.max_error_rate


class TooManyN(Predicate):
"""
Select reads that have too many 'N' bases.
Expand Down
20 changes: 18 additions & 2 deletions tests/test_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest
from dnaio import SequenceRecord

from cutadapt.predicates import TooManyN
from cutadapt.predicates import TooManyN, TooHighAverageErrorRate
from cutadapt.steps import PairedEndFilter


Expand All @@ -21,7 +21,6 @@
],
)
def test_too_many_n(seq, count, expected):
# third parameter is True if read should be Trueed
predicate = TooManyN(count=count)
_seq = SequenceRecord("read1", seq, qualities="#" * len(seq))
assert predicate.test(_seq, []) == expected
Expand Down Expand Up @@ -53,3 +52,20 @@ def test_invalid_pair_filter_mode():
with pytest.raises(ValueError) as e:
PairedEndFilter(None, None, None, "invalidmode")
assert "pair_filter_mode must be" in e.value.args[0]


@pytest.mark.parametrize(
    "qualities,max_rate,should_discard",
    [
        # '+' is Phred 10, i.e. an error probability of 0.1 per base.
        # 0.1 + 0.1 + 0.1 exceeds 0.3 due to floating point rounding,
        # so the average is (just) above 0.1 and the read is discarded.
        ("+" * 3, 0.1, True),
        # '!' is Phred 0, a certain error: total = 3 * 0.1 + 1 = 1.3.
        ("+" * 3 + "!", 0.1, True),  # average 0.325 > 0.1
        ("+" * 3 + "!", 0.33, False),  # average 0.325 <= 0.33
        ("+" * 3 + "!", 0.32, True),  # average 0.325 > 0.32
        # '~' is Phred 93; the single '!' dominates: average ~0.1 plus epsilon.
        ("~" * 9 + "!", 0.1, True),
    ],
)
def test_too_high_average_error_rate(qualities, max_rate, should_discard):
    predicate = TooHighAverageErrorRate(max_rate)
    record = SequenceRecord("read1", "A" * len(qualities), qualities=qualities)
    assert predicate.test(record, []) == should_discard

0 comments on commit a5f1d69

Please sign in to comment.