Skip to content

Commit

Permalink
MNT/DOC: rm old stuff; doc new fill match cells
Browse files Browse the repository at this point in the history
i am sure it is possible to avoid creating "matches" and instead
update the COO data directly, but palindrome matches make this
tricky (per the comment that remains in lines 241-246 in _make.py).
hmm - i guess since the COO data is filled in in a certain order
we could maybe use binary search or something to detect if a match
already exists ...? but that will get pretty involved
  • Loading branch information
fedarko committed Oct 21, 2024
1 parent 5cd0888 commit e3682d1
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 183 deletions.
204 changes: 31 additions & 173 deletions wotplot/_make.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pydivsufsort
from pydivsufsort import divsufsort
from ._scipy_sm_constructor_getter import get_sm_constructor
from ._logging import get_logger

Expand Down Expand Up @@ -62,19 +61,6 @@ def _validate_yorder(yorder):
raise ValueError("yorder must be 'BT' or 'TB'")


def _get_suffix_array(seq):
# We convert the seq to bytes (same as done in pydivsufsort here:
# https://github.com/louisabraham/pydivsufsort/blob/f4431ee1ea96ee5caf579d9b9e4764636d9cfef1/pydivsufsort/divsufsort.py#L73)
# in order to prevent a warning about it having to convert the seq to bytes
# for us. Ideally we wouldn't even work with strings at all (or we'd do
# this conversion at the start of _make() and use bytes from there on), but
# I don't really feel like making that change r/n and I don't think it'll
# make a big difference compared to this tool's other inefficiencies.
# Although it is still a TODO worth noting (converting both s2 and rc(s2)
# to bytes separately makes me feel gross).
return divsufsort(seq.encode("ascii") + ENDCHAR)


def _get_row(position_in_s2, num_rows, yorder):
"""Converts a position in the sequence s2 to a row index in the matrix.
Expand Down Expand Up @@ -117,61 +103,19 @@ def _get_row(position_in_s2, num_rows, yorder):
)


def _fill_match_cells(
s1, s2, k, cs, md, yorder="BT", binary=True, s2isrc=False
):
num_rows = len(s2) - k + 1
for match_run in cs:
num_cells_matched = match_run[2] - k + 1
for i in range(num_cells_matched):
x = match_run[0] + i
s2p = match_run[1] + i
if s2isrc:
s2p = len(s2) - s2p - k
y = _get_row(s2p, num_rows, yorder)
pos = (y, x)
if not binary:
if s2isrc:
if pos in md:
md[pos] = BOTH
else:
md[pos] = REV
else:
md[pos] = FWD
else:
md[pos] = MATCH


def _fill_match_cells_old(
s1, s2, k, s1_sa, s2_sa, md, yorder="BT", binary=True, s2isrc=False
):
"""Finds the start positions of shared k-mers in two strings.
Does this using suffix arrays, which makes this more memory-efficient than
a naive approach.
This used to just return the matching positions, but now its main "output"
is updating md (a dict that maps matrix position --> match types). This is
faster than outputting things from here in a nice, easy-to-read format and
then having to waste time converting that :(
def _fill_match_cells(s1, s2, k, md, yorder="BT", binary=True, s2isrc=False):
"""Populates a dict describing k-mer matches between two strings.
Parameters
----------
s1: str
s2: str
The strings in which we'll search for shared k-mers. We assume that
both of these strings have lengths >= k. These strings should NOT
contain the ENDCHAR character.
both of these strings have lengths >= k.
k: int
k-mer size.
s1_sa: np.ndarray
Suffix array for (s1 + ENDCHAR).
s2_sa: np.ndarray
Suffix array for (s2 + ENDCHAR).
md: dict of (int, int) --> int
We'll update this dict with keys of the format (p2, p1) (indicating
that a matching k-mer exists at position p1 in s1 and position p2 in
Expand All @@ -196,113 +140,35 @@ def _fill_match_cells_old(
None
(The main "side effect" of this function is updating md; see above.)
Notes
-----
See wotplot/tests/test_make_utils.py for an example of using this function.
Historical context: I used to have a doctest here, but then numpy 2
changed how np.int32 types (present in pydivsufsort's output suffix arrays)
were represented from "7" to "np.int32(7)", which broke the doctest for
systems running numpy 2. I couldn't find a good general doctest solution
that worked for both numpy < 2 and numpy 2 (and wasn't ugly).
Just for reference, this sort of problem w/r/t numpy 2 is documented in
https://github.com/OSGeo/grass/issues/4100, and the general brittleness
inherent to doctests is discussed at https://stackoverflow.com/q/13473971.
References
----------
This is done using pydivsufsort.common_substrings(), which is quite fast.
(Previously, my code used pydivsufsort.divsufsort() to create two suffix
arrays for both strings, and then iterated through these arrays to find
matches. The common_substrings() algorithm is better.) See
https://github.com/louisabraham/pydivsufsort/issues/42 for details.
"""
# i and j are indices in the two suffix arrays. we'll set them to 1 in
# order to skip the first entry in the suffix array (which will always
# correspond to the suffix of just ENDCHAR, i.e. "$").
i = 1
j = 1
last_kmer_index_1 = len(s1) - k
last_kmer_index_2 = len(s2) - k
cs = pydivsufsort.common_substrings(s1, s2, limit=k)
num_rows = len(s2) - k + 1
while i < len(s1_sa) and j < len(s2_sa):
p1 = s1_sa[i]
p2 = s2_sa[j]
# Since there are only n - k + 1 k-mers in a string of length n, we can
# ignore the last (k - 1) positions in the string -- there are suffixes
# that start here, but these suffixes have length < k so we don't care
# about them.
if p1 > last_kmer_index_1:
i += 1
continue
if p2 > last_kmer_index_2:
j += 1
continue
# if we've made it here, then we know that both i and j correspond to
# indices of suffixes in the two strings that each contain at least k
# characters
k1 = s1[p1 : p1 + k]
k2 = s2[p2 : p2 + k]
if k1 == k2:
# "Descend" through s1 and s2, identifying all suffixes where the
# beginning k-mer matches k1 (and k2, but now we know k1 == k2 so I
# just use k1 for clarity's sake).
#
# NOTE: In the second checks that these while loops make (looking
# at the beginning k-mer of next_i or next_j), there's the
# possibility that the positions given by s1_sa[next_i] or
# s2_sa[next_j] occur after last_kmer_index_1 or last_kmer_index_2,
# respectively. In these cases, the beginning "k-mer" will be cut
# off by the end of the string, and will thus by definition be
# unequal to k1. We could add more explicit checks here, but I'm
# not sure that the added clarity would be worth the potential
# slight performance hit. (In big sequences, most positions are ...
# not near the end of the sequence. damn they should give me a
# fields medal for that Math Wisdom i just brought into the world.)
next_i = i + 1
while (
next_i < len(s1_sa)
and s1[s1_sa[next_i] : s1_sa[next_i] + k] == k1
):
next_i += 1

next_j = j + 1
while (
next_j < len(s2_sa)
and s2[s2_sa[next_j] : s2_sa[next_j] + k] == k1
):
next_j += 1

# Ok, now we know the "span" of this k-mer in both strings' suffix
# arrays -- in s1_sa, it's range(i, next_i).
# (If this k-mer only occurs once in s1, then next_i = i + 1: so
# list(range(i, next_i)) == [i].)
#
# We'll add each match to matches, then jump to just past the ends
# of these "spans."
for mi in range(i, next_i):
x = s1_sa[mi]
for mj in range(j, next_j):
if s2isrc:
s2p = len(s2) - s2_sa[mj] - k
else:
s2p = s2_sa[mj]
y = _get_row(s2p, num_rows, yorder)
pos = (y, x)
if not binary:
if s2isrc:
if pos in md:
md[pos] = BOTH
else:
md[pos] = REV
else:
md[pos] = FWD
for match_run in cs:
num_cells_matched = match_run[2] - k + 1
for i in range(num_cells_matched):
x = match_run[0] + i
s2p = match_run[1] + i
if s2isrc:
s2p = len(s2) - s2p - k
y = _get_row(s2p, num_rows, yorder)
pos = (y, x)
if not binary:
if s2isrc:
if pos in md:
md[pos] = BOTH
else:
md[pos] = MATCH
i = next_i
j = next_j
else:
# find lexicographically smaller suffix
# (We can safely make this comparison using k1 and k2 as proxies
# for their entire suffix, because we will only end up in this
# branch if k1 != k2)
if k1 < k2:
i += 1
md[pos] = REV
else:
md[pos] = FWD
else:
j += 1
md[pos] = MATCH


def _make(s1, s2, k, yorder="BT", binary=True, verbose=False):
Expand Down Expand Up @@ -350,27 +216,19 @@ def _make(s1, s2, k, yorder="BT", binary=True, verbose=False):
# final row or column). Interestingly, Figure 6.20 in Bioinformatics
# Algorithms does include this extra empty space, but we'll omit it here
mat_shape = (len(s2) - k + 1, len(s1) - k + 1)

# TODO: could probably avoid creating "matches" at all, and just populate
# the matrix directly using the output of common_substrings().
matches = {}

_mlog("finding forward matches between s1 and s2...")
csfwd = pydivsufsort.common_substrings(s1, s2, limit=k)
_fill_match_cells(s1, s2, k, csfwd, matches, yorder=yorder, binary=binary)
_fill_match_cells(s1, s2, k, matches, yorder=yorder, binary=binary)
_mlog(f"found {len(matches):,} forward match cell(s).")
# I'm not sure if this makes a difference (is Python smart enough to
# immediately garbage-collect this at this point?), but let's be clear
del csfwd

_mlog("computing ReverseComplement(s2)...")
rcs2 = rc(s2)
_mlog("finding reverse-complementary matches between s1 and s2...")
csrev = pydivsufsort.common_substrings(s1, rcs2, limit=k)
_fill_match_cells(
s1, rcs2, k, csrev, matches, yorder=yorder, binary=binary, s2isrc=True
s1, rcs2, k, matches, yorder=yorder, binary=binary, s2isrc=True
)
_mlog(f"found {len(matches):,} total match cell(s).")
del csrev

density = 100 * (len(matches) / (mat_shape[0] * mat_shape[1]))
_mlog(f"density = {density:.2f}%.")
Expand Down
2 changes: 0 additions & 2 deletions wotplot/tests/test_make.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import os
import pytest
import pyfastx
import numpy as np
from wotplot import DotPlotMatrix, MATCH, FWD, REV, BOTH

Expand Down
10 changes: 2 additions & 8 deletions wotplot/tests/test_make_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import pydivsufsort
import pytest
from wotplot._make import (
rc,
_validate_and_stringify_seq,
_validate_k,
_validate_yorder,
_get_row,
_get_suffix_array,
_fill_match_cells,
)

Expand Down Expand Up @@ -111,10 +109,9 @@ def test_get_row_position_geq_num_rows():
def test_fill_match_cells():
s1 = "ACGTC"
s2 = "AAGTCAC"
csfwd = pydivsufsort.common_substrings(s1, s2, 2)
md = {}

_fill_match_cells(s1, s2, 2, csfwd, md, yorder="TB", binary=False)
_fill_match_cells(s1, s2, 2, md, yorder="TB", binary=False)
# Note that the coordinates' types will be set based on the types present
# within the suffix array -- so repr(md) will, for numpy 2, be equal to
# {(np.int32(5), np.int32(0)): FWD, ...}, at least as of writing.
Expand All @@ -124,10 +121,7 @@ def test_fill_match_cells():

# "Extend" md with reverse-complementary matches
s2r = rc(s2)
csrev = pydivsufsort.common_substrings(s1, s2r, 2)
_fill_match_cells(
s1, s2r, 2, csrev, md, yorder="TB", binary=False, s2isrc=True
)
_fill_match_cells(s1, s2r, 2, md, yorder="TB", binary=False, s2isrc=True)
assert md == {
(5, 0): FWD,
(2, 2): FWD,
Expand Down

0 comments on commit e3682d1

Please sign in to comment.