Skip to content

Commit

Permalink
Better debugging and more docs.
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcelloPerathoner committed Sep 23, 2023
1 parent 1337fc6 commit 8a4ab71
Show file tree
Hide file tree
Showing 60 changed files with 2,535 additions and 1,222 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["pypy3.8", "pypy3.9", "3.9", "3.10", "3.11"]
python-version: ["pypy3.9", "3.9", "3.10", "3.11", "3.12"]

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand All @@ -30,5 +30,3 @@ jobs:
- name: Run tox
run: |
tox -e py
#
43 changes: 22 additions & 21 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,70 +1,71 @@
.PHONY: lint test dist upload docs

BIN=.venv/bin
DIRS=src/ tests/unit/ docs/ scripts/ # tests/performance/
BROWSER=firefox
PYTEST=pytest --doctest-modules --doctest-glob="*.rst" --doctest-ignore-import-errors

all: lint test

black:
-black $(DIRS)
-$(BIN)/black $(DIRS)

blackdoc:
-blackdoc $(DIRS)
-$(BIN)/blackdoc $(DIRS)

pylint:
-pylint src/
-$(BIN)/pylint src/

mypy:
-mypy $(DIRS)
-$(BIN)/mypy $(DIRS)

doc8:
-doc8 README.rst

pydocstyle:
pydocstyle src/
$(BIN)/pydocstyle src/

lint: black blackdoc pylint mypy pydocstyle

test:
python3 -m $(PYTEST) src/ tests/ docs/ README.rst
$(BIN)/python3 -m $(PYTEST) src/ tests/ docs/ README.rst

test-performance:
python3 -m $(PYTEST) --performance tests/performance/
$(BIN)/python3 -m $(PYTEST) --performance tests/performance/

coverage:
coverage erase
coverage run --branch --source=src -m $(PYTEST) tests/
coverage run --append --branch --source=src -m $(PYTEST) --debug-mode tests/
coverage report
coverage html
$(BIN)/coverage erase
$(BIN)/coverage run --branch --source=src -m $(PYTEST) tests/
$(BIN)/coverage run --append --branch --source=src -m $(PYTEST) --debug-mode tests/
$(BIN)/coverage report
$(BIN)/coverage html
$(BROWSER) htmlcov/index.html

profile:
python3 -O -m scripts.profile
$(BIN)/python3 -O -m scripts.profile

docs:
cd docs; make html

badges: coverage
python docs/make_badges.py
$(BIN)/python docs/make_badges.py

tox:
tox
$(BIN)/tox

dist: clean test coverage badges
python3 -m build
twine check dist/*
$(BIN)/python3 -m build
$(BIN)/twine check dist/*

upload: dist
twine check dist/*
twine upload dist/*
$(BIN)/twine check dist/*
$(BIN)/twine upload dist/*

install:
pip3 install --force-reinstall -e .
$(BIN)/pip3 install --force-reinstall -e .

uninstall:
pip3 uninstall super_collator
$(BIN)/pip3 uninstall super_collator

clean:
-rm -rf dist build *.egg-info
Expand Down
6 changes: 4 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ Super Collator

.. |py311| image:: docs/_images/tox-py311.svg

.. |pypy38| image:: docs/_images/tox-pypy38.svg
.. |py312| image:: docs/_images/tox-py312.svg

.. |pypy39| image:: docs/_images/tox-pypy39.svg

.. |coverage| image:: docs/_images/coverage.svg

|py39| |py310| |py311| |pypy38| |coverage|
|py39| |py310| |py311| |py312| |pypy39| |coverage|

Collates textual sources with relaxed spelling. Uses Gotoh's variant of the
Needleman-Wunsch sequence alignment algorithm.
Expand Down
2 changes: 1 addition & 1 deletion docs/_build/.buildinfo
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: f33361227c63b5df6ab434ec7e4ac92e
config: 7ae0d97b33a5ecf92644856b784e478b
tags: 645f666f9bcd5a90fca523b33c5a78b7
93 changes: 93 additions & 0 deletions docs/_build/_sources/algorithm.rst.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
Collation Algorithm
~~~~~~~~~~~~~~~~~~~

The library uses an enhancement of the Needleman-Wunsch algorithm by Gotoh [Gotoh1982]_.
This section provides a very high level overview of the algorithm.


Phase 1 - Build Table
---------------------

In phase 1 the algorithm builds a table. For example this is the table built for the
two strings: *the quick brown fox jumps over the lazy dog* and *sick fox is crazy.*

.. raw:: html
:file: _static/super-collator-phase1.html

Every cell in the table contains three values: `D`, `P`, and `Q`, and an arrow, like this:

.. raw:: html
:align: center

<table class='super-collator super-collator-debug-matrix' style="margin-left: auto; margin-right: auto">
<tr><td class='outer'>
<table>
<tr><td class='d inner'>D</td><td class='p inner'>P</td></tr>
<tr><td class='q inner'>Q</td><td class='inner arrow'>↖</td></tr>
</table>
</td>
</tr>
</table>

We define the score `S` for each cell as:

.. math::

    S = \max(D, P, Q)

The grayed cells in the first row and first column are initialized using the *gap start*
and *gap extension* penalties. The numbers for each remaining cell are calculated using
only values from the three cells, to the top-left, the top, and the left, of the current
cell:

.. math::

    D = S_↖ + \mbox{similarity}(word_←, word_↑)

.. math::

    P = \max(S_↑ + openingpenalty, P_↑ + extensionpenalty)

.. math::

    Q = \max(S_← + openingpenalty, Q_← + extensionpenalty)

Finally the arrow in the current cell is set to point to that cell which yielded the
highest of the current cell's `D`, `P`, and `Q` values.


Phase 2 - Backtrack
-------------------

When the table is thus completed, two empty sequences are created. Then the algorithm
starts backtracking from the last (bottom-right) cell following the arrows until it
reaches the first (top-left) cell. If the arrow points:

↑
    the word in the row header is added to the first sequence, a hyphen is added to the
    second sequence,

↖
    the word in the row header is added to the first sequence, the word in the column
    header is added to the second sequence,

←
    a hyphen is added to the first sequence, the word in the column header is added to the
    second sequence.

.. raw:: html
:file: _static/super-collator-phase2.html

Finally the two sequences are reversed and printed.

.. raw:: html
:file: _static/super-collator-result.html


Parameters
----------

The algorithm can be customized by setting:

- a word comparison (similarity) function,
- the starting gap penalty,
- the gap opening penalty,
- and the gap extension penalty.
2 changes: 1 addition & 1 deletion docs/_build/_sources/examples.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Install:
$ pip install super-collator
Align with relaxed spelling:
Align two strings with relaxed spelling using N-Grams:

.. code-block:: python
Expand Down
1 change: 1 addition & 0 deletions docs/_build/_sources/index.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Needleman-Wunsch sequence alignment algorithm.
:caption: Contents:

examples
algorithm
aligner
ngrams
references
Expand Down
9 changes: 5 additions & 4 deletions docs/_build/_sources/references.rst.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
References
==========

.. [Gotoh1982] Gotoh, O. An Improved Algorithm for Matching Biological Sequences.
1982. J. Mol. Biol. 162, 705-708
.. [Gotoh1982] Gotoh, O. 1982. *An Improved Algorithm for Matching Biological
Sequences.* J. Mol. Biol. 162, 705-708
http://jaligner.sourceforge.net/references/gotoh1982.pdf
.. [NeedlemanWunsch1970] Needleman, S. Wunsch, C. A General Method Applicable to the
.. [NeedlemanWunsch1970] Needleman, S. Wunsch, C. 1970. *A General Method Applicable to the
Search for Similarities in the Amino Acid Sequence of Two
Proteins. 1970. J. Mol. Biol. 48, 443-453
Proteins.* J. Mol. Biol. 48, 443-453
123 changes: 123 additions & 0 deletions docs/_build/_static/_sphinx_javascript_frameworks_compat.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/* Compatibility shim for jQuery and underscore.js.
*
* Copyright Sphinx contributors
* Released under the two clause BSD licence
*/

/**
 * Decode a URL-encoded string, such as one component of a query string.
 *
 * Every "+" is first turned into a space, then the result is passed through
 * decodeURIComponent. A falsy argument (undefined, null, "") is handed back
 * unchanged instead of being decoded.
 *
 * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL
 */
jQuery.urldecode = function(x) {
    return x ? decodeURIComponent(x.replace(/\+/g, ' ')) : x;
};

/**
 * Small helper to URL-encode strings.
 *
 * This is a direct alias of the built-in encodeURIComponent, so it takes the
 * same argument and returns the same result.
 */
jQuery.urlencode = encodeURIComponent;

/**
 * Parse the parameters of a URL query string.
 *
 * When no argument is given, the query string of the current document
 * (document.location.search) is parsed. Everything up to and including the
 * first "?" is ignored; the rest is split on "&" into key=value pairs and
 * both parts are run through jQuery.urldecode.
 *
 * The result maps each key to an array of values, so a key that occurs
 * several times keeps all of its values in order of appearance.
 */
jQuery.getQueryParameters = function(s) {
    var query = (typeof s === 'undefined') ? document.location.search : s;
    var pairs = query.slice(query.indexOf('?') + 1).split('&');
    var params = {};

    pairs.forEach(function(pair) {
        var fields = pair.split('=', 2);
        var name = jQuery.urldecode(fields[0]);
        var decoded = jQuery.urldecode(fields[1]);

        // First occurrence of a key starts a fresh list; later ones append.
        if (!(name in params)) {
            params[name] = [];
        }
        params[name].push(decoded);
    });

    return params;
};

/**
 * Highlight a given string inside a jQuery object by wrapping it in an
 * element carrying the given class name.
 *
 * In ordinary HTML the match is wrapped in a <span class=className>. Inside
 * an <svg> it is wrapped in a <tspan> (no class is set on it) and a <rect>
 * with the class name, sized to the bounding box of the text's parent, is
 * inserted before that parent.
 *
 * The search lower-cases the node text but not `text` itself.
 * NOTE(review): this presumably expects callers to pass an already
 * lower-cased `text` -- confirm against the call sites.
 *
 * Returns the value of this.each(...).
 */
jQuery.fn.highlightText = function(text, className) {
// Recursive worker: handles one DOM node and queues SVG rects in addItems.
function highlight(node, addItems) {
// nodeType 3 is a text node.
if (node.nodeType === 3) {
var val = node.nodeValue;
var pos = val.toLowerCase().indexOf(text);
// Skip text whose parent is already highlighted or is marked "nohighlight".
if (pos >= 0 &&
!jQuery(node.parentNode).hasClass(className) &&
!jQuery(node.parentNode).hasClass("nohighlight")) {
var span;
// True when the nearest ancestor-or-self among body/svg/foreignObject is an <svg>.
var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg");
if (isInSVG) {
span = document.createElementNS("http://www.w3.org/2000/svg", "tspan");
} else {
span = document.createElement("span");
span.className = className;
}
// The wrapper receives the matched slice of the original text.
span.appendChild(document.createTextNode(val.substr(pos, text.length)));
// The inner insertBefore places the text after the match right behind `node`;
// the outer one then places the wrapper in front of that new text node.
node.parentNode.insertBefore(span, node.parentNode.insertBefore(
document.createTextNode(val.substr(pos + text.length)),
node.nextSibling));
// The original node keeps only the text before the match.
node.nodeValue = val.substr(0, pos);
if (isInSVG) {
// Build a rect matching the bounding box of the text's parent element.
var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect");
var bbox = node.parentElement.getBBox();
rect.x.baseVal.value = bbox.x;
rect.y.baseVal.value = bbox.y;
rect.width.baseVal.value = bbox.width;
rect.height.baseVal.value = bbox.height;
rect.setAttribute('class', className);
// Queued rather than inserted here; insertion happens after the traversal below.
addItems.push({
"parent": node.parentNode,
"target": rect});
}
}
}
// Non-text nodes: descend into children, except for button/select/textarea.
else if (!jQuery(node).is("button, select, textarea")) {
jQuery.each(node.childNodes, function() {
highlight(this, addItems);
});
}
}
var addItems = [];
var result = this.each(function() {
highlight(this, addItems);
});
// Insert every queued SVG rect before its recorded parent node.
for (var i = 0; i < addItems.length; ++i) {
jQuery(addItems[i].parent).before(addItems[i].target);
}
return result;
};

/*
 * Backward compatibility for jQuery.browser.
 * This will be supported until firefox bug is fixed.
 *
 * Only installed when jQuery.browser is missing. jQuery.uaMatch(ua) returns
 * {browser, version} parsed from a user-agent string; jQuery.browser then
 * gets a single key, the detected browser name, set to true.
 */
if (!jQuery.browser) {
    jQuery.uaMatch = function(ua) {
        var agent = ua.toLowerCase();

        // Tried in order; the first pattern that matches wins.
        var patterns = [
            /(chrome)[ \/]([\w.]+)/,
            /(webkit)[ \/]([\w.]+)/,
            /(opera)(?:.*version|)[ \/]([\w.]+)/,
            /(msie) ([\w.]+)/
        ];

        var found = null;
        for (var k = 0; k < patterns.length && !found; k++) {
            found = patterns[k].exec(agent);
        }

        // The mozilla pattern is only consulted when the agent string does
        // not contain "compatible".
        if (!found && agent.indexOf("compatible") < 0) {
            found = /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(agent);
        }
        if (!found) {
            found = [];
        }

        return {
            browser: found[ 1 ] || "",
            version: found[ 2 ] || "0"
        };
    };

    jQuery.browser = {};
    jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true;
}
Loading

0 comments on commit 8a4ab71

Please sign in to comment.