From 1337fc616b37e753cf07e21551d697531237ec65 Mon Sep 17 00:00:00 2001 From: Marcello Perathoner Date: Tue, 8 Nov 2022 16:24:52 +0100 Subject: [PATCH] WIP --- .github/workflows/python-package.yml | 2 +- Makefile | 2 +- README.rst | 15 +-- docs/_build/_sources/examples.rst.txt | 83 ++++++++----- docs/_build/_sources/index.rst.txt | 3 +- docs/_build/_sources/ngrams.rst.txt | 4 + docs/_build/_sources/references.rst.txt | 3 - docs/_build/aligner.html | 35 ++++-- docs/_build/doctrees/aligner.doctree | Bin 55022 -> 62424 bytes docs/_build/doctrees/environment.pickle | Bin 24426 -> 18755 bytes docs/_build/doctrees/examples.doctree | Bin 7847 -> 9591 bytes docs/_build/doctrees/index.doctree | Bin 4789 -> 4774 bytes docs/_build/doctrees/ngrams.doctree | Bin 0 -> 15731 bytes docs/_build/doctrees/references.doctree | Bin 4450 -> 3919 bytes docs/_build/doctrees/token.doctree | Bin 53515 -> 2357 bytes docs/_build/examples.html | 86 +++++++++----- docs/_build/genindex.html | 129 ++++---------------- docs/_build/index.html | 6 +- docs/_build/ngrams.html | 150 ++++++++++++++++++++++++ docs/_build/objects.inv | Bin 711 -> 515 bytes docs/_build/py-modindex.html | 20 +--- docs/_build/references.html | 11 +- docs/_build/search.html | 3 +- docs/_build/searchindex.js | 2 +- docs/_build/token.html | 146 +---------------------- docs/_images/coverage.svg | 2 +- docs/_images/tox-py.svg | 1 + docs/examples.rst | 85 +++++++++----- docs/index.rst | 3 +- docs/ngrams.rst | 4 + docs/references.rst | 3 - docs/strategy.rst | 4 - docs/token.rst | 4 - pyproject.toml | 2 +- scripts/cli.py | 19 +-- src/super_collator/aligner.py | 72 ++++++++---- src/super_collator/ngrams.py | 39 ++++++ src/super_collator/strategy.py | 104 ---------------- src/super_collator/super_collator.py | 46 +++----- src/super_collator/token.py | 109 ----------------- tests/unit/test_aligner.py | 99 +++++++++------- tests/unit/test_aligner_pos.py | 38 ++++-- tests/unit/test_bitcount.py | 27 ----- tests/unit/test_multiple_align.py | 80 +++++++++++++ tests/unit/test_ngrams.py | 45 +++++++ tests/unit/test_strategy.py | 87 -------------- tests/unit/test_super_collator.py | 9 +- tests/unit/test_token.py | 25 ---- 48 files changed, 724 insertions(+), 883 deletions(-) create mode 100644 docs/_build/_sources/ngrams.rst.txt create mode 100644 docs/_build/doctrees/ngrams.doctree create mode 100644 docs/_build/ngrams.html create mode 100644 docs/_images/tox-py.svg create mode 100644 docs/ngrams.rst delete mode 100644 docs/strategy.rst delete mode 100644 docs/token.rst mode change 100644 => 100755 scripts/cli.py create mode 100644 src/super_collator/ngrams.py delete mode 100644 src/super_collator/strategy.py delete mode 100644 src/super_collator/token.py delete mode 100644 tests/unit/test_bitcount.py create mode 100644 tests/unit/test_multiple_align.py create mode 100644 tests/unit/test_ngrams.py delete mode 100644 tests/unit/test_strategy.py delete mode 100644 tests/unit/test_token.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index df53292..f273a40 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11-dev"] + python-version: ["pypy3.8", "pypy3.9", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/Makefile b/Makefile index ec219c1..42f6cac 100644 --- a/Makefile +++ b/Makefile @@ -52,7 +52,7 @@ badges: coverage tox: tox -dist: test coverage # badges +dist: clean test coverage badges python3 -m build twine check dist/* diff --git a/README.rst b/README.rst index 12d5cea..1228e78 100644 --- a/README.rst +++ b/README.rst @@ -23,18 +23,19 @@ Needleman-Wunsch sequence alignment algorithm. .. code-block:: python - >>> from super_collator.strategy import CommonNgramsStrategy - >>> from super_collator.token import SingleToken - >>> from super_collator.super_collator import align, to_table + >>> from super_collator.aligner import Aligner + >>> from super_collator.ngrams import NGrams + >>> from super_collator.super_collator import to_table + >>> aligner = Aligner(-0.5, -0.5, -0.5) >>> a = "Lorem ipsum dollar amat adipiscing elit" >>> b = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit" >>> - >>> a = [SingleToken(s) for s in a.split()] - >>> b = [SingleToken(s) for s in b.split()] + >>> a = [NGrams(s).load(s, 3) for s in a.split()] + >>> b = [NGrams(s).load(s, 3) for s in b.split()] >>> - >>> c, score = align(a, b, CommonNgramsStrategy(2)) - >>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE + >>> a, b, score = aligner.align(a, b, NGrams.similarity, lambda: NGrams("-")) + >>> print(to_table(list(map(str, a)), list(map(str, b)))) # doctest: +NORMALIZE_WHITESPACE - Lorem ipsum - dollar - amat - adipiscing elit qui dolorem ipsum quia dolor sit amet consectetur adipisci velit diff --git a/docs/_build/_sources/examples.rst.txt b/docs/_build/_sources/examples.rst.txt index 483f6bf..333d0d5 100644 --- a/docs/_build/_sources/examples.rst.txt +++ b/docs/_build/_sources/examples.rst.txt @@ -12,41 +12,58 @@ Align with relaxed spelling: .. code-block:: python - >>> from super_collator.strategy import CommonNgramsStrategy - >>> from super_collator.token import SingleToken - >>> from super_collator.super_collator import align, to_table + >>> from super_collator.aligner import Aligner + >>> from super_collator.ngrams import NGrams + >>> from super_collator.super_collator import to_table + >>> aligner = Aligner(-0.5, -0.5, -0.5) >>> a = "Lorem ipsum dollar amat adipiscing elit" >>> b = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit" >>> - >>> a = [SingleToken(s) for s in a.split()] - >>> b = [SingleToken(s) for s in b.split()] + >>> a = [NGrams(s).load(s, 3) for s in a.split()] + >>> b = [NGrams(s).load(s, 3) for s in b.split()] >>> - >>> c, score = align(a, b, CommonNgramsStrategy(2)) - >>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE + >>> a, b, score = aligner.align(a, b, NGrams.similarity, lambda: NGrams("-")) + >>> print(to_table(list(map(str, a)), list(map(str, b)))) # doctest: +NORMALIZE_WHITESPACE - Lorem ipsum - dollar - amat - adipiscing elit qui dolorem ipsum quia dolor sit amet consectetur adipisci velit -Multiple alignment: +Multiple alignment: We repeatedly align two lists of NGrams against each other. .. code-block:: python - >>> from super_collator.strategy import CommonNgramsStrategy - >>> from super_collator.token import SingleToken - >>> from super_collator.super_collator import align, to_table - + >>> from super_collator.aligner import Aligner + >>> from super_collator.ngrams import NGrams + >>> from super_collator.super_collator import to_table + >>> + >>> def similarity(aa, bb): + ... sim = float("-inf") + ... for a in aa: + ... for b in bb: + ... score = NGrams.similarity(a, b) + ... if score > sim: + ... sim = score + ... return sim + >>> + >>> def merge(aa, bb): + ... return [a + b for a, b in zip(aa, bb)] + >>> + >>> aligner = Aligner(-1.0, -0.5, -0.5) >>> a = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit" >>> b = "Lorem ipsum adipiscing" >>> c = "Lorem dollar amat elit" >>> - >>> a = [SingleToken(s) for s in a.split()] - >>> b = [SingleToken(s) for s in b.split()] - >>> c = [SingleToken(s) for s in c.split()] + >>> a = [[NGrams(s).load(s, 2)] for s in a.split()] + >>> b = [[NGrams(s).load(s, 2)] for s in b.split()] + >>> c = [[NGrams(s).load(s, 2)] for s in c.split()] >>> - >>> d, score = align(a, b, CommonNgramsStrategy(2)) - >>> e, score = align(d, c, CommonNgramsStrategy(2)) - >>> print(to_table(e)) # doctest: +NORMALIZE_WHITESPACE + >>> a, b, score = aligner.align(a, b, similarity, lambda: [NGrams("-")], lambda: [NGrams("-")]) + >>> ab = merge(a, b) + >>> ab, c, score = aligner.align(ab, c, similarity, lambda: [NGrams("-")] * 2, lambda: [NGrams("-")]) + >>> abc = merge(ab, c) + >>> + >>> print(to_table(*zip(*[[t.user_data for t in nn] for nn in abc]))) # doctest: +NORMALIZE_WHITESPACE qui dolorem ipsum quia dolor sit amet consectetur adipisci velit - Lorem ipsum - - - - - adipiscing - - Lorem - - dollar - amat - - elit @@ -56,21 +73,29 @@ Align two sentences using their part-of-speech tags only: .. code-block:: python - >>> from super_collator.strategy import Strategy - >>> from super_collator.token import SingleToken - >>> from super_collator.super_collator import align, to_table - - >>> class PosStrategy(Strategy): - ... def similarity(self, a, b): - ... return 1.0 if a.user_data == b.user_data else 0.0 + >>> from super_collator.aligner import Aligner + >>> from super_collator.super_collator import to_table + >>> + >>> class PosToken: + ... def __init__(self, s, pos): + ... self.s = s + ... self.pos = pos + ... + ... def __str__(self): + ... return self.s + ... + ... @staticmethod + ... def similarity(a, b): + ... return 1.0 if a.pos == b.pos else 0.0 >>> + >>> aligner = Aligner() >>> a = "it/PRP was/VBD a/DT dark/JJ and/CC stormy/JJ night/NN" >>> b = "it/PRP is/VBZ a/DT fine/JJ day/NN" >>> - >>> a = [SingleToken(*s.split("/")) for s in a.split()] - >>> b = [SingleToken(*s.split("/")) for s in b.split()] + >>> a = [PosToken(*s.split("/")) for s in a.split()] + >>> b = [PosToken(*s.split("/")) for s in b.split()] >>> - >>> c, score = align(a, b, PosStrategy()) - >>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE + >>> c, d, score = aligner.align(a, b, PosToken.similarity, lambda: PosToken("-", "")) + >>> print(to_table(list(map(str, c)), list(map(str, d)))) # doctest: +NORMALIZE_WHITESPACE it was a dark and stormy night it is a fine - - day diff --git a/docs/_build/_sources/index.rst.txt b/docs/_build/_sources/index.rst.txt index 843964b..e0c3e2e 100644 --- a/docs/_build/_sources/index.rst.txt +++ b/docs/_build/_sources/index.rst.txt @@ -11,8 +11,7 @@ Needleman-Wunsch sequence alignment algorithm. examples aligner - strategy - token + ngrams references diff --git a/docs/_build/_sources/ngrams.rst.txt b/docs/_build/_sources/ngrams.rst.txt new file mode 100644 index 0000000..0f60e6c --- /dev/null +++ b/docs/_build/_sources/ngrams.rst.txt @@ -0,0 +1,4 @@ +NGrams +------ + +.. automodule:: super_collator.ngrams diff --git a/docs/_build/_sources/references.rst.txt b/docs/_build/_sources/references.rst.txt index c22f346..f4cb5b2 100644 --- a/docs/_build/_sources/references.rst.txt +++ b/docs/_build/_sources/references.rst.txt @@ -7,6 +7,3 @@ References .. [NeedlemanWunsch1970] Needleman, S. Wunsch, C. A General Method Applicable to the Search for Similarities in the Amino Acid Sequence of Two Proteins. 1970. J. Mol. Biol. 48, 443-453 - -.. [Warren2013] Warren, H.S. Hacker's Delight. - 2nd Edition 2013. Pearson, Westford, MA. diff --git a/docs/_build/aligner.html b/docs/_build/aligner.html index 8a146e1..78d2c85 100644 --- a/docs/_build/aligner.html +++ b/docs/_build/aligner.html @@ -22,7 +22,7 @@ - + @@ -46,8 +46,7 @@ @@ -128,8 +127,17 @@ Gotoh’s technique the gap weight formula must be of the special form \(w_k = uk + v\) (affine gap). \(k\) is the gap size, \(v\) is the gap opening score and \(u\) the gap extension score.

-

The aligner is type-agnostic and expects only to call the method -Strategy.similarity() on the given strategy.

+

The aligner is type-agnostic. When the aligner wants to compare two objects, it +calls the method similarity() with both objects as arguments. This method +should return the score of the alignment. The score should increase with the +desirability of the alignment, but otherwise there are no fixed rules.

+

The score must harmonize with the penalties for inserting gaps. If the score for +opening a gap is -1.0 (the default) then a satisfactory match should return a score +> 1.0.

+

The similarity() function may consult a PAM or BLOSUM matrix, or compute a +hamming distance between the arguments. It may also use auxiliary data like +Part-of-Speech tags. In this case the data type aligned could be a dict containing +the word and the POS-tag.

See also

[NeedlemanWunsch1970]

@@ -161,11 +169,18 @@
-align(strategy: super_collator.strategy.Strategy[super_collator.token.TT], tokens_a: Sequence[super_collator.token.Token[super_collator.token.TT]], tokens_b: Sequence[super_collator.token.Token[super_collator.token.TT]]) Tuple[Sequence[super_collator.token.Token[super_collator.token.TT]], float]
+align(seq_a: Sequence[object], seq_b: Sequence[object], similarity: Callable[[object, object], float], gap_a: Optional[Callable[[], object]] = None, gap_b: Optional[Callable[[], object]] = None) Tuple[Sequence[object], Sequence[object], float]

Align two sequences.

-
Returns
-

the aligned sequence (of MultiTokens) and the score

+
Parameters
+
    +
  • similarity – a callable that returns the similarity of two objects

  • +
  • gap_a – insert gap_a() for a gap in sequence a. None inserts None.

  • +
  • gap_b – insert gap_b() for a gap in sequence b. None inserts gap_a().

  • +
+
+
Returns
+

the aligned sequences and the score

@@ -174,7 +189,7 @@
-build_debug_matrix(matrix: List[List[super_collator.aligner.Data]], len_matrix: List[List[int]], ts_a: Sequence[super_collator.token.Token[super_collator.token.TT]], ts_b: Sequence[super_collator.token.Token[super_collator.token.TT]]) str
+build_debug_matrix(matrix: List[List[super_collator.aligner.Data]], len_matrix: List[List[int]], ts_a: Sequence[object], ts_b: Sequence[object]) str

Build a human-readable debug matrix.

Parameters
@@ -198,7 +213,7 @@