Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcelloPerathoner committed Nov 8, 2022
1 parent 0262b6c commit 1337fc6
Show file tree
Hide file tree
Showing 48 changed files with 724 additions and 883 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11-dev"]
python-version: ["pypy3.8", "pypy3.9", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ badges: coverage
tox:
tox

dist: test coverage # badges
dist: clean test coverage badges
python3 -m build
twine check dist/*

Expand Down
15 changes: 8 additions & 7 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,19 @@ Needleman-Wunsch sequence alignment algorithm.
.. code-block:: python
>>> from super_collator.strategy import CommonNgramsStrategy
>>> from super_collator.token import SingleToken
>>> from super_collator.super_collator import align, to_table
>>> from super_collator.aligner import Aligner
>>> from super_collator.ngrams import NGrams
>>> from super_collator.super_collator import to_table
>>> aligner = Aligner(-0.5, -0.5, -0.5)
>>> a = "Lorem ipsum dollar amat adipiscing elit"
>>> b = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>>
>>> a = [SingleToken(s) for s in a.split()]
>>> b = [SingleToken(s) for s in b.split()]
>>> a = [NGrams(s).load(s, 3) for s in a.split()]
>>> b = [NGrams(s).load(s, 3) for s in b.split()]
>>>
>>> c, score = align(a, b, CommonNgramsStrategy(2))
>>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE
>>> a, b, score = aligner.align(a, b, NGrams.similarity, lambda: NGrams("-"))
>>> print(to_table(list(map(str, a)), list(map(str, b)))) # doctest: +NORMALIZE_WHITESPACE
- Lorem ipsum - dollar - amat - adipiscing elit
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
Expand Down
83 changes: 54 additions & 29 deletions docs/_build/_sources/examples.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,41 +12,58 @@ Align with relaxed spelling:

.. code-block:: python
>>> from super_collator.strategy import CommonNgramsStrategy
>>> from super_collator.token import SingleToken
>>> from super_collator.super_collator import align, to_table
>>> from super_collator.aligner import Aligner
>>> from super_collator.ngrams import NGrams
>>> from super_collator.super_collator import to_table
>>> aligner = Aligner(-0.5, -0.5, -0.5)
>>> a = "Lorem ipsum dollar amat adipiscing elit"
>>> b = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>>
>>> a = [SingleToken(s) for s in a.split()]
>>> b = [SingleToken(s) for s in b.split()]
>>> a = [NGrams(s).load(s, 3) for s in a.split()]
>>> b = [NGrams(s).load(s, 3) for s in b.split()]
>>>
>>> c, score = align(a, b, CommonNgramsStrategy(2))
>>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE
>>> a, b, score = aligner.align(a, b, NGrams.similarity, lambda: NGrams("-"))
>>> print(to_table(list(map(str, a)), list(map(str, b)))) # doctest: +NORMALIZE_WHITESPACE
- Lorem ipsum - dollar - amat - adipiscing elit
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
Multiple alignment:
Multiple alignment: We repeatedly align two sequences against each other, where each element is a list of NGrams, and merge the aligned columns after each pass.

.. code-block:: python
>>> from super_collator.strategy import CommonNgramsStrategy
>>> from super_collator.token import SingleToken
>>> from super_collator.super_collator import align, to_table
>>> from super_collator.aligner import Aligner
>>> from super_collator.ngrams import NGrams
>>> from super_collator.super_collator import to_table
>>>
>>> def similarity(aa, bb):
... sim = float("-inf")
... for a in aa:
... for b in bb:
... score = NGrams.similarity(a, b)
... if score > sim:
... sim = score
... return sim
>>>
>>> def merge(aa, bb):
... return [a + b for a, b in zip(aa, bb)]
>>>
>>> aligner = Aligner(-1.0, -0.5, -0.5)
>>> a = "qui dolorem ipsum quia dolor sit amet consectetur adipisci velit"
>>> b = "Lorem ipsum adipiscing"
>>> c = "Lorem dollar amat elit"
>>>
>>> a = [SingleToken(s) for s in a.split()]
>>> b = [SingleToken(s) for s in b.split()]
>>> c = [SingleToken(s) for s in c.split()]
>>> a = [[NGrams(s).load(s, 2)] for s in a.split()]
>>> b = [[NGrams(s).load(s, 2)] for s in b.split()]
>>> c = [[NGrams(s).load(s, 2)] for s in c.split()]
>>>
>>> d, score = align(a, b, CommonNgramsStrategy(2))
>>> e, score = align(d, c, CommonNgramsStrategy(2))
>>> print(to_table(e)) # doctest: +NORMALIZE_WHITESPACE
>>> a, b, score = aligner.align(a, b, similarity, lambda: [NGrams("-")], lambda: [NGrams("-")])
>>> ab = merge(a, b)
>>> ab, c, score = aligner.align(ab, c, similarity, lambda: [NGrams("-")] * 2, lambda: [NGrams("-")])
>>> abc = merge(ab, c)
>>>
>>> print(to_table(*zip(*[[t.user_data for t in nn] for nn in abc]))) # doctest: +NORMALIZE_WHITESPACE
qui dolorem ipsum quia dolor sit amet consectetur adipisci velit
- Lorem ipsum - - - - - adipiscing -
- Lorem - - dollar - amat - - elit
Expand All @@ -56,21 +73,29 @@ Align two sentences using their part-of-speech tags only:

.. code-block:: python
>>> from super_collator.strategy import Strategy
>>> from super_collator.token import SingleToken
>>> from super_collator.super_collator import align, to_table
>>> class PosStrategy(Strategy):
... def similarity(self, a, b):
... return 1.0 if a.user_data == b.user_data else 0.0
>>> from super_collator.aligner import Aligner
>>> from super_collator.super_collator import to_table
>>>
>>> class PosToken:
... def __init__(self, s, pos):
... self.s = s
... self.pos = pos
...
... def __str__(self):
... return self.s
...
... @staticmethod
... def similarity(a, b):
... return 1.0 if a.pos == b.pos else 0.0
>>>
>>> aligner = Aligner()
>>> a = "it/PRP was/VBD a/DT dark/JJ and/CC stormy/JJ night/NN"
>>> b = "it/PRP is/VBZ a/DT fine/JJ day/NN"
>>>
>>> a = [SingleToken(*s.split("/")) for s in a.split()]
>>> b = [SingleToken(*s.split("/")) for s in b.split()]
>>> a = [PosToken(*s.split("/")) for s in a.split()]
>>> b = [PosToken(*s.split("/")) for s in b.split()]
>>>
>>> c, score = align(a, b, PosStrategy())
>>> print(to_table(c)) # doctest: +NORMALIZE_WHITESPACE
>>> c, d, score = aligner.align(a, b, PosToken.similarity, lambda: PosToken("-", ""))
>>> print(to_table(list(map(str, c)), list(map(str, d)))) # doctest: +NORMALIZE_WHITESPACE
it was a dark and stormy night
it is a fine - - day
3 changes: 1 addition & 2 deletions docs/_build/_sources/index.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ Needleman-Wunsch sequence alignment algorithm.

examples
aligner
strategy
token
ngrams
references


Expand Down
4 changes: 4 additions & 0 deletions docs/_build/_sources/ngrams.rst.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
NGrams
------

.. automodule:: super_collator.ngrams
3 changes: 0 additions & 3 deletions docs/_build/_sources/references.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,3 @@ References
.. [NeedlemanWunsch1970] Needleman, S. Wunsch, C. A General Method Applicable to the
Search for Similarities in the Amino Acid Sequence of Two
Proteins. 1970. J. Mol. Biol. 48, 443-453
.. [Warren2013] Warren, H.S. Hacker's Delight.
2nd Edition 2013. Pearson, Westford, MA.
Loading

0 comments on commit 1337fc6

Please sign in to comment.