From 1337fc616b37e753cf07e21551d697531237ec65 Mon Sep 17 00:00:00 2001
From: Marcello Perathoner
Date: Tue, 8 Nov 2022 16:24:52 +0100
Subject: [PATCH] WIP
---
.github/workflows/python-package.yml | 2 +-
Makefile | 2 +-
README.rst | 15 +--
docs/_build/_sources/examples.rst.txt | 83 ++++++++-----
docs/_build/_sources/index.rst.txt | 3 +-
docs/_build/_sources/ngrams.rst.txt | 4 +
docs/_build/_sources/references.rst.txt | 3 -
docs/_build/aligner.html | 35 ++++--
docs/_build/doctrees/aligner.doctree | Bin 55022 -> 62424 bytes
docs/_build/doctrees/environment.pickle | Bin 24426 -> 18755 bytes
docs/_build/doctrees/examples.doctree | Bin 7847 -> 9591 bytes
docs/_build/doctrees/index.doctree | Bin 4789 -> 4774 bytes
docs/_build/doctrees/ngrams.doctree | Bin 0 -> 15731 bytes
docs/_build/doctrees/references.doctree | Bin 4450 -> 3919 bytes
docs/_build/doctrees/token.doctree | Bin 53515 -> 2357 bytes
docs/_build/examples.html | 86 +++++++++-----
docs/_build/genindex.html | 129 ++++----------------
docs/_build/index.html | 6 +-
docs/_build/ngrams.html | 150 ++++++++++++++++++++++++
docs/_build/objects.inv | Bin 711 -> 515 bytes
docs/_build/py-modindex.html | 20 +---
docs/_build/references.html | 11 +-
docs/_build/search.html | 3 +-
docs/_build/searchindex.js | 2 +-
docs/_build/token.html | 146 +----------------------
docs/_images/coverage.svg | 2 +-
docs/_images/tox-py.svg | 1 +
docs/examples.rst | 85 +++++++++-----
docs/index.rst | 3 +-
docs/ngrams.rst | 4 +
docs/references.rst | 3 -
docs/strategy.rst | 4 -
docs/token.rst | 4 -
pyproject.toml | 2 +-
scripts/cli.py | 19 +--
src/super_collator/aligner.py | 72 ++++++++----
src/super_collator/ngrams.py | 39 ++++++
src/super_collator/strategy.py | 104 ----------------
src/super_collator/super_collator.py | 46 +++-----
src/super_collator/token.py | 109 -----------------
tests/unit/test_aligner.py | 99 +++++++++-------
tests/unit/test_aligner_pos.py | 38 ++++--
tests/unit/test_bitcount.py | 27 -----
tests/unit/test_multiple_align.py | 80 +++++++++++++
tests/unit/test_ngrams.py | 45 +++++++
tests/unit/test_strategy.py | 87 --------------
tests/unit/test_super_collator.py | 9 +-
tests/unit/test_token.py | 25 ----
48 files changed, 724 insertions(+), 883 deletions(-)
create mode 100644 docs/_build/_sources/ngrams.rst.txt
create mode 100644 docs/_build/doctrees/ngrams.doctree
create mode 100644 docs/_build/ngrams.html
create mode 100644 docs/_images/tox-py.svg
create mode 100644 docs/ngrams.rst
delete mode 100644 docs/strategy.rst
delete mode 100644 docs/token.rst
mode change 100644 => 100755 scripts/cli.py
create mode 100644 src/super_collator/ngrams.py
delete mode 100644 src/super_collator/strategy.py
delete mode 100644 src/super_collator/token.py
delete mode 100644 tests/unit/test_bitcount.py
create mode 100644 tests/unit/test_multiple_align.py
create mode 100644 tests/unit/test_ngrams.py
delete mode 100644 tests/unit/test_strategy.py
delete mode 100644 tests/unit/test_token.py
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index df53292..f273a40 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: ["3.9", "3.10", "3.11-dev"]
+ python-version: ["pypy3.8", "pypy3.9", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v3
diff --git a/Makefile b/Makefile
index ec219c1..42f6cac 100644
--- a/Makefile
+++ b/Makefile
@@ -52,7 +52,7 @@ badges: coverage
tox:
tox
-dist: test coverage # badges
+dist: clean test coverage badges
python3 -m build
twine check dist/*
diff --git a/README.rst b/README.rst
index 12d5cea..1228e78 100644
--- a/README.rst
+++ b/README.rst
@@ -23,18 +23,19 @@ Needleman-Wunsch sequence alignment algorithm.
.. code-block:: python
- >>> from super_collator.strategy import CommonNgramsStrategy
- >>> from super_collator.token import SingleToken
- >>> from super_collator.super_collator import align, to_table
+ >>> from super_collator.aligner import Aligner
+ >>> from super_collator.ngrams import NGrams
+ >>> from super_collator.super_collator import to_table
+ >>> aligner = Aligner(-0.5, -0.5, -0.5)
>>> a = "Lorem ipsum dollar amat adipiscing elit"
>>> b = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>>
- >>> a = [SingleToken(s) for s in a.split()]
- >>> b = [SingleToken(s) for s in b.split()]
+ >>> a = [NGrams(s).load(s, 3) for s in a.split()]
+ >>> b = [NGrams(s).load(s, 3) for s in b.split()]
>>>
- >>> c, score = align(a, b, CommonNgramsStrategy(2))
- >>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE
+ >>> a, b, score = aligner.align(a, b, NGrams.similarity, lambda: NGrams("-"))
+ >>> print(to_table(list(map(str, a)), list(map(str, b)))) # doctest: +NORMALIZE_WHITESPACE
- Lorem ipsum - dollar - amat - adipiscing elit
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
diff --git a/docs/_build/_sources/examples.rst.txt b/docs/_build/_sources/examples.rst.txt
index 483f6bf..333d0d5 100644
--- a/docs/_build/_sources/examples.rst.txt
+++ b/docs/_build/_sources/examples.rst.txt
@@ -12,41 +12,58 @@ Align with relaxed spelling:
.. code-block:: python
- >>> from super_collator.strategy import CommonNgramsStrategy
- >>> from super_collator.token import SingleToken
- >>> from super_collator.super_collator import align, to_table
+ >>> from super_collator.aligner import Aligner
+ >>> from super_collator.ngrams import NGrams
+ >>> from super_collator.super_collator import to_table
+ >>> aligner = Aligner(-0.5, -0.5, -0.5)
>>> a = "Lorem ipsum dollar amat adipiscing elit"
>>> b = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>>
- >>> a = [SingleToken(s) for s in a.split()]
- >>> b = [SingleToken(s) for s in b.split()]
+ >>> a = [NGrams(s).load(s, 3) for s in a.split()]
+ >>> b = [NGrams(s).load(s, 3) for s in b.split()]
>>>
- >>> c, score = align(a, b, CommonNgramsStrategy(2))
- >>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE
+ >>> a, b, score = aligner.align(a, b, NGrams.similarity, lambda: NGrams("-"))
+ >>> print(to_table(list(map(str, a)), list(map(str, b)))) # doctest: +NORMALIZE_WHITESPACE
- Lorem ipsum - dollar - amat - adipiscing elit
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
-Multiple alignment:
+Multiple alignment: We repeatedly align two lists of NGrams against each other.
.. code-block:: python
- >>> from super_collator.strategy import CommonNgramsStrategy
- >>> from super_collator.token import SingleToken
- >>> from super_collator.super_collator import align, to_table
-
+ >>> from super_collator.aligner import Aligner
+ >>> from super_collator.ngrams import NGrams
+ >>> from super_collator.super_collator import to_table
+ >>>
+ >>> def similarity(aa, bb):
+ ... sim = float("-inf")
+ ... for a in aa:
+ ... for b in bb:
+ ... score = NGrams.similarity(a, b)
+ ... if score > sim:
+ ... sim = score
+ ... return sim
+ >>>
+ >>> def merge(aa, bb):
+ ... return [a + b for a, b in zip(aa, bb)]
+ >>>
+ >>> aligner = Aligner(-1.0, -0.5, -0.5)
>>> a = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>> b = "Lorem ipsum adipiscing"
>>> c = "Lorem dollar amat elit"
>>>
- >>> a = [SingleToken(s) for s in a.split()]
- >>> b = [SingleToken(s) for s in b.split()]
- >>> c = [SingleToken(s) for s in c.split()]
+ >>> a = [[NGrams(s).load(s, 2)] for s in a.split()]
+ >>> b = [[NGrams(s).load(s, 2)] for s in b.split()]
+ >>> c = [[NGrams(s).load(s, 2)] for s in c.split()]
>>>
- >>> d, score = align(a, b, CommonNgramsStrategy(2))
- >>> e, score = align(d, c, CommonNgramsStrategy(2))
- >>> print(to_table(e)) # doctest: +NORMALIZE_WHITESPACE
+ >>> a, b, score = aligner.align(a, b, similarity, lambda: [NGrams("-")], lambda: [NGrams("-")])
+ >>> ab = merge(a, b)
+ >>> ab, c, score = aligner.align(ab, c, similarity, lambda: [NGrams("-")] * 2, lambda: [NGrams("-")])
+ >>> abc = merge(ab, c)
+ >>>
+ >>> print(to_table(*zip(*[[t.user_data for t in nn] for nn in abc]))) # doctest: +NORMALIZE_WHITESPACE
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
- Lorem ipsum - - - - - adipiscing -
- Lorem - - dollar - amat - - elit
@@ -56,21 +73,29 @@ Align two sentences using their part-of-speech tags only:
.. code-block:: python
- >>> from super_collator.strategy import Strategy
- >>> from super_collator.token import SingleToken
- >>> from super_collator.super_collator import align, to_table
-
- >>> class PosStrategy(Strategy):
- ... def similarity(self, a, b):
- ... return 1.0 if a.user_data == b.user_data else 0.0
+ >>> from super_collator.aligner import Aligner
+ >>> from super_collator.super_collator import to_table
+ >>>
+ >>> class PosToken:
+ ... def __init__(self, s, pos):
+ ... self.s = s
+ ... self.pos = pos
+ ...
+ ... def __str__(self):
+ ... return self.s
+ ...
+ ... @staticmethod
+ ... def similarity(a, b):
+ ... return 1.0 if a.pos == b.pos else 0.0
>>>
+ >>> aligner = Aligner()
>>> a = "it/PRP was/VBD a/DT dark/JJ and/CC stormy/JJ night/NN"
>>> b = "it/PRP is/VBZ a/DT fine/JJ day/NN"
>>>
- >>> a = [SingleToken(*s.split("/")) for s in a.split()]
- >>> b = [SingleToken(*s.split("/")) for s in b.split()]
+ >>> a = [PosToken(*s.split("/")) for s in a.split()]
+ >>> b = [PosToken(*s.split("/")) for s in b.split()]
>>>
- >>> c, score = align(a, b, PosStrategy())
- >>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE
+ >>> c, d, score = aligner.align(a, b, PosToken.similarity, lambda: PosToken("-", ""))
+ >>> print(to_table(list(map(str, c)), list(map(str, d)))) # doctest: +NORMALIZE_WHITESPACE
it was a dark and stormy night
it is a fine - - day
diff --git a/docs/_build/_sources/index.rst.txt b/docs/_build/_sources/index.rst.txt
index 843964b..e0c3e2e 100644
--- a/docs/_build/_sources/index.rst.txt
+++ b/docs/_build/_sources/index.rst.txt
@@ -11,8 +11,7 @@ Needleman-Wunsch sequence alignment algorithm.
examples
aligner
- strategy
- token
+ ngrams
references
diff --git a/docs/_build/_sources/ngrams.rst.txt b/docs/_build/_sources/ngrams.rst.txt
new file mode 100644
index 0000000..0f60e6c
--- /dev/null
+++ b/docs/_build/_sources/ngrams.rst.txt
@@ -0,0 +1,4 @@
+NGrams
+------
+
+.. automodule:: super_collator.ngrams
diff --git a/docs/_build/_sources/references.rst.txt b/docs/_build/_sources/references.rst.txt
index c22f346..f4cb5b2 100644
--- a/docs/_build/_sources/references.rst.txt
+++ b/docs/_build/_sources/references.rst.txt
@@ -7,6 +7,3 @@ References
.. [NeedlemanWunsch1970] Needleman, S. Wunsch, C. A General Method Applicable to the
Search for Similarities in the Amino Acid Sequence of Two
Proteins. 1970. J. Mol. Biol. 48, 443-453
-
-.. [Warren2013] Warren, H.S. Hacker's Delight.
- 2nd Edition 2013. Pearson, Westford, MA.
diff --git a/docs/_build/aligner.html b/docs/_build/aligner.html
index 8a146e1..78d2c85 100644
--- a/docs/_build/aligner.html
+++ b/docs/_build/aligner.html
@@ -22,7 +22,7 @@
-
+
@@ -46,8 +46,7 @@
@@ -128,8 +127,17 @@
Gotoh’s technique the gap weight formula must be of the special form \(w_k = uk + v\)
(affine gap). \(k\) is the gap size, \(v\) is the gap opening score and \(u\) the gap
extension score.
-The aligner is type-agnostic and expects only to call the method
-Strategy.similarity()
on the given strategy.
+The aligner is type-agnostic. When the aligner wants to compare two objects, it
+calls the method similarity()
with both objects as arguments. This method
+should return the score of the alignment. The score should increase with the
+desirability of the alignment, but otherwise there are no fixed rules.
+The score must harmonize with the penalties for inserting gaps. If the score for
+opening a gap is -1.0 (the default) then a satisfactory match should return a score
+> 1.0.
+The similarity()
function may consult a PAM or BLOSUM matrix, or compute a
+Hamming distance between the arguments. It may also use auxiliary data like
+Part-of-Speech tags. In this case the data type aligned could be a dict containing
+the word and the POS-tag.
See also
[NeedlemanWunsch1970]
@@ -161,11 +169,18 @@
-align ( strategy : super_collator.strategy.Strategy [ super_collator.token.TT ] , tokens_a : Sequence [ super_collator.token.Token [ super_collator.token.TT ] ] , tokens_b : Sequence [ super_collator.token.Token [ super_collator.token.TT ] ] ) → Tuple [ Sequence [ super_collator.token.Token [ super_collator.token.TT ] ] , float ]
+align ( seq_a : Sequence [ object ] , seq_b : Sequence [ object ] , similarity : Callable [ [ object , object ] , float ] , gap_a : Optional [ Callable [ [ ] , object ] ] = None , gap_b : Optional [ Callable [ [ ] , object ] ] = None ) → Tuple [ Sequence [ object ] , Sequence [ object ] , float ]
Align two sequences.
-Returns
-the aligned sequence (of MultiTokens) and the score
+Parameters
+
+similarity – a callable that returns the similarity of two objects
+gap_a – insert gap_a() for a gap in sequence a. None inserts None.
+gap_b – insert gap_b() for a gap in sequence b. None inserts gap_a().
+
+
+Returns
+the aligned sequences and the score
@@ -174,7 +189,7 @@
-build_debug_matrix ( matrix : List [ List [ super_collator.aligner.Data ] ] , len_matrix : List [ List [ int ] ] , ts_a : Sequence [ super_collator.token.Token [ super_collator.token.TT ] ] , ts_b : Sequence [ super_collator.token.Token [ super_collator.token.TT ] ] ) → str
+build_debug_matrix ( matrix : List [ List [ super_collator.aligner.Data ] ] , len_matrix : List [ List [ int ] ] , ts_a : Sequence [ object ] , ts_b : Sequence [ object ] ) → str
Build a human-readable debug matrix.
Parameters
@@ -198,7 +213,7 @@