MNT/DOC: rm old stuff; doc new fill match cells

i am sure it is possible to avoid creating "matches" and instead update the COO data directly, but palindrome matches make this tricky (per the comment that remains in lines 241-246 in _make.py). hmm - i guess since the COO data is filled in in a certain order we could maybe use binary search or something to detect if a match already exists ...? but that will get pretty involved
fedarko · Oct 21, 2024 · e3682d1 · e3682d1
1 parent 5cd0888
commit e3682d1
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 183 deletions.
diff --git a/wotplot/_make.py b/wotplot/_make.py
@@ -1,5 +1,4 @@
 import pydivsufsort
-from pydivsufsort import divsufsort
 from ._scipy_sm_constructor_getter import get_sm_constructor
 from ._logging import get_logger
 
@@ -62,19 +61,6 @@ def _validate_yorder(yorder):
         raise ValueError("yorder must be 'BT' or 'TB'")
 
 
-def _get_suffix_array(seq):
-    # We convert the seq to bytes (same as done in pydivsufsort here:
-    # https://github.com/louisabraham/pydivsufsort/blob/f4431ee1ea96ee5caf579d9b9e4764636d9cfef1/pydivsufsort/divsufsort.py#L73)
-    # in order to prevent a warning about it having to convert the seq to bytes
-    # for us. Ideally we wouldn't even work with strings at all (or we'd do
-    # this conversion at the start of _make() and use bytes from there on), but
-    # I don't really feel like making that change r/n and I don't think it'll
-    # make a big difference compared to this tool's other inefficiencies.
-    # Although it is still a TODO worth noting (converting both s2 and rc(s2)
-    # to bytes separately makes me feel gross).
-    return divsufsort(seq.encode("ascii") + ENDCHAR)
-
-
 def _get_row(position_in_s2, num_rows, yorder):
     """Converts a position in the sequence s2 to a row index in the matrix.
 
@@ -117,61 +103,19 @@ def _get_row(position_in_s2, num_rows, yorder):
         )
 
 
-def _fill_match_cells(
-    s1, s2, k, cs, md, yorder="BT", binary=True, s2isrc=False
-):
-    num_rows = len(s2) - k + 1
-    for match_run in cs:
-        num_cells_matched = match_run[2] - k + 1
-        for i in range(num_cells_matched):
-            x = match_run[0] + i
-            s2p = match_run[1] + i
-            if s2isrc:
-                s2p = len(s2) - s2p - k
-            y = _get_row(s2p, num_rows, yorder)
-            pos = (y, x)
-            if not binary:
-                if s2isrc:
-                    if pos in md:
-                        md[pos] = BOTH
-                    else:
-                        md[pos] = REV
-                else:
-                    md[pos] = FWD
-            else:
-                md[pos] = MATCH
-
-
-def _fill_match_cells_old(
-    s1, s2, k, s1_sa, s2_sa, md, yorder="BT", binary=True, s2isrc=False
-):
-    """Finds the start positions of shared k-mers in two strings.
-
-    Does this using suffix arrays, which makes this more memory-efficient than
-    a naive approach.
-
-    This used to just return the matching positions, but now its main "output"
-    is updating md (a dict that maps matrix position --> match types). This is
-    faster than outputting things from here in a nice, easy-to-read format and
-    then having to waste time converting that :(
+def _fill_match_cells(s1, s2, k, md, yorder="BT", binary=True, s2isrc=False):
+    """Populates a dict describing k-mer matches between two strings.
 
     Parameters
     ----------
     s1: str
     s2: str
         The strings in which we'll search for shared k-mers. We assume that
-        both of these strings have lengths >= k. These strings should NOT
-        contain the ENDCHAR character.
+        both of these strings have lengths >= k.
 
     k: int
         k-mer size.
 
-    s1_sa: np.ndarray
-        Suffix array for (s1 + ENDCHAR).
-
-    s2_sa: np.ndarray
-        Suffix array for (s2 + ENDCHAR).
-
     md: dict of (int, int) --> int
         We'll update this dict with keys of the format (p2, p1) (indicating
         that a matching k-mer exists at position p1 in s1 and position p2 in
@@ -196,113 +140,35 @@ def _fill_match_cells_old(
     None
         (The main "side effect" of this function is updating md; see above.)
 
-    Notes
-    -----
-    See wotplot/tests/test_make_utils.py for an example of using this function.
-
-    Historical context: I used to have a doctest here, but then numpy 2
-    changed how np.int32 types (present in pydivsufsort's output suffix arrays)
-    were represented from "7" to "np.int32(7)", which broke the doctest for
-    systems running numpy 2. I couldn't find a good general doctest solution
-    that worked for both numpy < 2 and numpy 2 (and wasn't ugly).
-
-    Just for reference, this sort of problem w/r/t numpy 2 is documented in
-    https://github.com/OSGeo/grass/issues/4100, and the general brittleness
-    inherent to doctests is discussed at https://stackoverflow.com/q/13473971.
+    References
+    ----------
+    This is done using pydivsufsort.common_substrings(), which is quite fast.
+    (Previously, my code used pydivsufsort.divsufsort() to create two suffix
+    arrays for both strings, and then iterated through these arrays to find
+    matches. The common_substrings() algorithm is better.) See
+    https://github.com/louisabraham/pydivsufsort/issues/42 for details.
     """
-    # i and j are indices in the two suffix arrays. we'll set them to 1 in
-    # order to skip the first entry in the suffix array (which will always
-    # correspond to the suffix of just ENDCHAR, i.e. "$").
-    i = 1
-    j = 1
-    last_kmer_index_1 = len(s1) - k
-    last_kmer_index_2 = len(s2) - k
+    cs = pydivsufsort.common_substrings(s1, s2, limit=k)
     num_rows = len(s2) - k + 1
-    while i < len(s1_sa) and j < len(s2_sa):
-        p1 = s1_sa[i]
-        p2 = s2_sa[j]
-        # Since there are only n - k + 1 k-mers in a string of length n, we can
-        # ignore the last (k - 1) positions in the string -- there are suffixes
-        # that start here, but these suffixes have length < k so we don't care
-        # about them.
-        if p1 > last_kmer_index_1:
-            i += 1
-            continue
-        if p2 > last_kmer_index_2:
-            j += 1
-            continue
-        # if we've made it here, then we know that both i and j correspond to
-        # indices of suffixes in the two strings that each contain at least k
-        # characters
-        k1 = s1[p1 : p1 + k]
-        k2 = s2[p2 : p2 + k]
-        if k1 == k2:
-            # "Descend" through s1 and s2, identifying all suffixes where the
-            # beginning k-mer matches k1 (and k2, but now we know k1 == k2 so I
-            # just use k1 for clarity's sake).
-            #
-            # NOTE: In the second checks that these while loops make (looking
-            # at the beginning k-mer of next_i or next_j), there's the
-            # possibility that the positions given by s1_sa[next_i] or
-            # s2_sa[next_j] occur after last_kmer_index_1 or last_kmer_index_2,
-            # respectively. In these cases, the beginning "k-mer" will be cut
-            # off by the end of the string, and will thus by definition be
-            # unequal to k1. We could add more explicit checks here, but I'm
-            # not sure that the added clarity would be worth the potential
-            # slight performance hit. (In big sequences, most positions are ...
-            # not near the end of the sequence. damn they should give me a
-            # fields medal for that Math Wisdom i just brought into the world.)
-            next_i = i + 1
-            while (
-                next_i < len(s1_sa)
-                and s1[s1_sa[next_i] : s1_sa[next_i] + k] == k1
-            ):
-                next_i += 1
-
-            next_j = j + 1
-            while (
-                next_j < len(s2_sa)
-                and s2[s2_sa[next_j] : s2_sa[next_j] + k] == k1
-            ):
-                next_j += 1
-
-            # Ok, now we know the "span" of this k-mer in both strings' suffix
-            # arrays -- in s1_sa, it's range(i, next_i).
-            # (If this k-mer only occurs once in s1, then next_i = i + 1: so
-            # list(range(i, next_i)) == [i].)
-            #
-            # We'll add each match to matches, then jump to just past the ends
-            # of these "spans."
-            for mi in range(i, next_i):
-                x = s1_sa[mi]
-                for mj in range(j, next_j):
-                    if s2isrc:
-                        s2p = len(s2) - s2_sa[mj] - k
-                    else:
-                        s2p = s2_sa[mj]
-                    y = _get_row(s2p, num_rows, yorder)
-                    pos = (y, x)
-                    if not binary:
-                        if s2isrc:
-                            if pos in md:
-                                md[pos] = BOTH
-                            else:
-                                md[pos] = REV
-                        else:
-                            md[pos] = FWD
+    for match_run in cs:
+        num_cells_matched = match_run[2] - k + 1
+        for i in range(num_cells_matched):
+            x = match_run[0] + i
+            s2p = match_run[1] + i
+            if s2isrc:
+                s2p = len(s2) - s2p - k
+            y = _get_row(s2p, num_rows, yorder)
+            pos = (y, x)
+            if not binary:
+                if s2isrc:
+                    if pos in md:
+                        md[pos] = BOTH
                     else:
-                        md[pos] = MATCH
-            i = next_i
-            j = next_j
-        else:
-            # find lexicographically smaller suffix
-            # (We can safely make this comparison using k1 and k2 as proxies
-            # for their entire suffix, because we will only end up in this
-            # branch if k1 != k2)
-            if k1 < k2:
-                i += 1
+                        md[pos] = REV
+                else:
+                    md[pos] = FWD
             else:
-                j += 1
+                md[pos] = MATCH
 
 
 def _make(s1, s2, k, yorder="BT", binary=True, verbose=False):
@@ -350,27 +216,19 @@ def _make(s1, s2, k, yorder="BT", binary=True, verbose=False):
     # final row or column). Interestingly, Figure 6.20 in Bioinformatics
     # Algorithms does include this extra empty space, but we'll omit it here
     mat_shape = (len(s2) - k + 1, len(s1) - k + 1)
-
-    # TODO: could probably avoid creating "matches" at all, and just populate
-    # the matrix directly using the output of common_substrings().
     matches = {}
+
     _mlog("finding forward matches between s1 and s2...")
-    csfwd = pydivsufsort.common_substrings(s1, s2, limit=k)
-    _fill_match_cells(s1, s2, k, csfwd, matches, yorder=yorder, binary=binary)
+    _fill_match_cells(s1, s2, k, matches, yorder=yorder, binary=binary)
     _mlog(f"found {len(matches):,} forward match cell(s).")
-    # I'm not sure if this makes a difference (is Python smart enough to
-    # immediately garbage-collect this at this point?), but let's be clear
-    del csfwd
 
     _mlog("computing ReverseComplement(s2)...")
     rcs2 = rc(s2)
     _mlog("finding reverse-complementary matches between s1 and s2...")
-    csrev = pydivsufsort.common_substrings(s1, rcs2, limit=k)
     _fill_match_cells(
-        s1, rcs2, k, csrev, matches, yorder=yorder, binary=binary, s2isrc=True
+        s1, rcs2, k, matches, yorder=yorder, binary=binary, s2isrc=True
     )
     _mlog(f"found {len(matches):,} total match cell(s).")
-    del csrev
 
     density = 100 * (len(matches) / (mat_shape[0] * mat_shape[1]))
     _mlog(f"density = {density:.2f}%.")

diff --git a/wotplot/tests/test_make.py b/wotplot/tests/test_make.py
@@ -1,6 +1,4 @@
-import os
 import pytest
-import pyfastx
 import numpy as np
 from wotplot import DotPlotMatrix, MATCH, FWD, REV, BOTH
 

diff --git a/wotplot/tests/test_make_utils.py b/wotplot/tests/test_make_utils.py
@@ -1,12 +1,10 @@
-import pydivsufsort
 import pytest
 from wotplot._make import (
     rc,
     _validate_and_stringify_seq,
     _validate_k,
     _validate_yorder,
     _get_row,
-    _get_suffix_array,
     _fill_match_cells,
 )
 
@@ -111,10 +109,9 @@ def test_get_row_position_geq_num_rows():
 def test_fill_match_cells():
     s1 = "ACGTC"
     s2 = "AAGTCAC"
-    csfwd = pydivsufsort.common_substrings(s1, s2, 2)
     md = {}
 
-    _fill_match_cells(s1, s2, 2, csfwd, md, yorder="TB", binary=False)
+    _fill_match_cells(s1, s2, 2, md, yorder="TB", binary=False)
     # Note that the coordinates' types will be set based on the types present
     # within the suffix array -- so repr(md) will, for numpy 2, be equal to
     # {(np.int32(5), np.int32(0)): FWD, ...}, at least as of writing.
@@ -124,10 +121,7 @@ def test_fill_match_cells():
 
     # "Extend" md with reverse-complementary matches
     s2r = rc(s2)
-    csrev = pydivsufsort.common_substrings(s1, s2r, 2)
-    _fill_match_cells(
-        s1, s2r, 2, csrev, md, yorder="TB", binary=False, s2isrc=True
-    )
+    _fill_match_cells(s1, s2r, 2, md, yorder="TB", binary=False, s2isrc=True)
     assert md == {
         (5, 0): FWD,
         (2, 2): FWD,