From 35916172362359f5b18c8c0a0b13697ff9cfb3cb Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Sat, 30 Dec 2023 15:51:18 -0500 Subject: [PATCH] Add CI script + lint code. --- .github/workflows/package.yml | 128 +++++++++++++++++++++++++++++++++ src/minhashsketch.py | 8 +-- src/parallel.py | 28 ++++---- src/tests/test__murmurhash3.py | 5 +- src/tests/test_parallel.py | 100 ++++++++++++++++++-------- 5 files changed, 219 insertions(+), 50 deletions(-) create mode 100644 .github/workflows/package.yml diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml new file mode 100644 index 0000000..9777e0d --- /dev/null +++ b/.github/workflows/package.yml @@ -0,0 +1,128 @@ +name: Python package + +on: [push, pull_request] + +jobs: + code-qc: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Lint with flake8 + run: | + python -m pip install flake8 + flake8 src/ + build-sdist: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Set up R ${{ matrix.r-version }} + uses: r-lib/actions/setup-r@v2 + with: + r-version: release + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install setuptools build + - name: Build sdist + run: | + python -m build -o dist/ --sdist . + - name: Upload source package. 
+ uses: actions/upload-artifact@v3 + with: + name: mashing-pumpkins-sdist + path: dist/mashing-pumpkins-*.tar.gz + build-wheel-posix: + runs-on: ${{ matrix.os }} + permissions: + packages: read + continue-on-error: true + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + python: 39 + platform_id: manylinux2014 + - os: macos-11 + python: 310 + platform_id: macosx_x86_64 + - os: macos-11 + python: 310 + platform_id: macosx_arm64 + - os: macos-12 + python: 310 + platform_id: macosx_arm64 + - os: macos-13 + python: 310 + platform_id: macosx_arm64 + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install setuptools build cibuildwheel + - name: Build wheel + uses: pypa/cibuildwheel@v2.16.2 + env: + CIBW_SKIP: cp36-* cp37-* + CIBW_ARCHS_LINUX: "auto aarch64" + CIBW_ARCHS_MACOS: "x86_64 arm64" + CIBW_BUILD_VERBOSITY: 1 + with: + output-dir: wheelhouse + config-file: pyproject.toml + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: binary-wheels-${{ matrix.platform_id }} + path: wheelhouse/*.whl + test: + needs: [build-sdist] + runs-on: ${{ matrix.os }} + continue-on-error: ${{ matrix.experimental }} + strategy: + max-parallel: 4 + matrix: + python-version: [3.8, 3.9, "3.10", "3.11"] + r-version: ['release'] + os: [ubuntu-latest, ubuntu-20.04, macOS-latest] + venv_activate: ["source pyenv_base/bin/activate"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Create base virtualenv (non-windows) + run: | + python -m venv pyenv_base + ${{ matrix.venv_activate }} + python -m pip install -U pip wheel + - uses: actions/download-artifact@v3 + with: + name: mashing-pumpkins-sdist + path: dist/ + - name: Source package. 
+ shell: bash + run: echo "SRC_DIST=$(ls -1 dist/*.tar.gz | tail -n 1)" >> $GITHUB_ENV + - name: Install package (non-Windows) + run: | + ${{ matrix.venv_activate }} + pip install $SRC_DIST + - name: Test + run: | + ${{ matrix.venv_activate }} + python -m pip install $SRC_DIST'[test_minimal]' + pytest src/tests/ diff --git a/src/minhashsketch.py b/src/minhashsketch.py index aa70dec..54d1332 100644 --- a/src/minhashsketch.py +++ b/src/minhashsketch.py @@ -70,7 +70,7 @@ def _minmaxhash_add_ngrams( # Replace the maximum value in the heap. heapmap[h] = elt out = heapreplace(heap, elt) - del(heapmap[sign * out[0]]) + del (heapmap[sign * out[0]]) # The negative of the hash is needed for MinHash. heaptop = sign * heap[0][0] if anynew is not None: @@ -172,7 +172,7 @@ def _replace(self, h, elt): heapmap = self._heapmap heapmap[h] = elt out = heapreplace(self._heap, elt) - del(heapmap[self._extracthash(out)]) + del (heapmap[self._extracthash(out)]) return out def __add__(self, obj): @@ -405,7 +405,7 @@ def _replace(self, h, elt): heapmap = self._heapmap heapmap[h] = elt out = heapreplace(self._heap, elt) - del(heapmap[self._extracthash(out)]) + del (heapmap[self._extracthash(out)]) return out def _add(self, subs, nsubs, hashbuffer, heaptop, @@ -541,7 +541,7 @@ class CountTrait(object): def _replace(self, h, elt): out = super()._replace(h, elt) - del(self._count[self._extracthash(out)]) + del (self._count[self._extracthash(out)]) return out def _anynew(self, h): diff --git a/src/parallel.py b/src/parallel.py index 0364d91..d2beebc 100644 --- a/src/parallel.py +++ b/src/parallel.py @@ -2,21 +2,20 @@ Parallelization utilities """ -from functools import reduce -import multiprocessing - class Sketch(object): @staticmethod def initializer(cls, *args): """ - FIXME: use of global not really nice (possible root of mysterious issue for user) + FIXME: use of global not really nice (possible root of mysterious + issue for user) """ global sketch_constructor + def sketch_constructor(): 
return cls(*args) - + @staticmethod def map_sequence(sequence): """ @@ -59,7 +58,8 @@ class SketchList(object): @staticmethod def initializer(clslist, argslist): """ - FIXME: use of global not really nice (possible root of mysterious issue for user) + FIXME: use of global not really nice (possible root of mysterious + issue for user) """ # Allow automagic expansion of the list of classes @@ -69,12 +69,15 @@ def initializer(clslist, argslist): # Allow automagic expansion of the list of args if len(argslist) == 1: argslist = tuple(argslist[0] for x in range(len(clslist))) - + if len(clslist) != len(argslist): - raise ValueError("The arguments argslist and clslist must be sequences of either the " - "same length, or of length 1.") - + raise ValueError( + "The arguments argslist and clslist must be sequences of " + "either the same length, or of length 1." + ) + global sketchlist_constructor + def sketchlist_constructor(): return (cls(*args) for cls, args in zip(clslist, argslist)) @@ -111,8 +114,9 @@ def reduce(alist, blist): - alist: a sequence of sketches - blist: a sequence of sketches - return alist after each of its elements has been updated to the corresponding element in blist + return alist after each of its elements has been updated to the + corresponding element in blist """ - for a,b in zip(alist, blist): + for a, b in zip(alist, blist): a.update(b) return alist diff --git a/src/tests/test__murmurhash3.py b/src/tests/test__murmurhash3.py index 6e368ba..2964ba1 100644 --- a/src/tests/test__murmurhash3.py +++ b/src/tests/test__murmurhash3.py @@ -1,11 +1,9 @@ -import pytest - import array from mashingpumpkins import _murmurhash3 def test_hasharray(): - nsize=3 + nsize = 3 buffer = array.array('Q', [0, ]) seed = 42 _murmurhash3.hasharray(b"ACG", nsize, buffer, seed) @@ -14,4 +12,3 @@ def test_hasharray(): seed = 43 _murmurhash3.hasharray(b"ACG", nsize, buffer, seed) assert buffer[0] != 1731421407650554201 - diff --git a/src/tests/test_parallel.py 
b/src/tests/test_parallel.py index e5b2be4..41df85a 100644 --- a/src/tests/test_parallel.py +++ b/src/tests/test_parallel.py @@ -5,6 +5,13 @@ from mashingpumpkins import minhashsketch import random + +def _make_sequence(): + return b''.join( + random.choice((b'A', b'T', b'G', b'C')) for x in range(250) + ) + + def test_sketch_initializer(): # empty initializer @@ -13,13 +20,19 @@ def test_sketch_initializer(): nsize = 21 maxsize = 10 - hashfun = lambda input,width,hashbuffer: None + + def hashfun(input, width, hashbuffer): + return None + seed = 0 cls = minhashsketch.MaxSketch - mashingpumpkins.parallel.Sketch.initializer(cls, nsize, maxsize, hashfun, seed) + mashingpumpkins.parallel.Sketch.initializer( + cls, nsize, maxsize, hashfun, seed + ) hasattr(mashingpumpkins.parallel, 'sketch_constructor') assert type(mashingpumpkins.parallel.sketch_constructor()) is cls + def test_sketch_map_sequence(): nsize = 21 @@ -27,15 +40,18 @@ def test_sketch_map_sequence(): hashfun = hasharray seed = DEFAULT_SEED cls = minhashsketch.MaxSketch - mashingpumpkins.parallel.Sketch.initializer(cls, nsize, maxsize, hashfun, seed) + mashingpumpkins.parallel.Sketch.initializer( + cls, nsize, maxsize, hashfun, seed + ) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) + sequence = _make_sequence() mhs = mashingpumpkins.parallel.Sketch.map_sequence(sequence) assert mhs.nsize == nsize assert mhs.maxsize == maxsize - assert mhs.nvisited == len(sequence)-nsize+1 + assert mhs.nvisited == len(sequence) - nsize + 1 + def test_sketch_map_sequences(): @@ -44,16 +60,22 @@ def test_sketch_map_sequences(): hashfun = hasharray seed = DEFAULT_SEED cls = minhashsketch.MaxSketch - mashingpumpkins.parallel.Sketch.initializer(cls, nsize, maxsize, hashfun, seed) + mashingpumpkins.parallel.Sketch.initializer( + cls, nsize, maxsize, hashfun, seed + ) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) - sequences = 
(sequence[beg:end] for beg, end in chunkpos_iter(nsize, len(sequence), 100)) + sequence = _make_sequence() + sequences = ( + sequence[beg:end] for beg, end in + chunkpos_iter(nsize, len(sequence), 100) + ) mhs = mashingpumpkins.parallel.Sketch.map_sequences(sequences) assert mhs.nsize == nsize assert mhs.maxsize == maxsize - assert mhs.nvisited == len(sequence)-nsize+1 + assert mhs.nvisited == len(sequence) - nsize + 1 + def test_sketch_reduce_sketches(): @@ -66,13 +88,13 @@ def test_sketch_reduce_sketches(): mhs = cls(nsize, maxsize, hashfun, seed) mhs_a = cls(nsize, maxsize, hashfun, seed) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) + sequence = _make_sequence() mhs.add(sequence) mhs_a.add(sequence) mhs_b = cls(nsize, maxsize, hashfun, seed) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) + sequence = _make_sequence() mhs.add(sequence) mhs_b.add(sequence) @@ -91,7 +113,10 @@ def test_sketchlist_initializer(): nsize = 21 maxsize = 10 - hashfun = lambda input,width,hashbuffer: None + + def hashfun(input, width, hashbuffer): + return None + seed = 0 clslist = (minhashsketch.MaxSketch, minhashsketch.MinSketch) @@ -100,11 +125,13 @@ def test_sketchlist_initializer(): mashingpumpkins.parallel.SketchList.initializer(clslist, []) # automagic length adjustment for argslist - mashingpumpkins.parallel.SketchList.initializer(clslist, [(nsize, maxsize, hashfun, seed)]) + mashingpumpkins.parallel.SketchList.initializer( + clslist, [(nsize, maxsize, hashfun, seed)] + ) hasattr(mashingpumpkins.parallel, 'sketchlist_constructor') - l = tuple(mashingpumpkins.parallel.sketchlist_constructor()) - assert len(l) == len(clslist) - for elt, cls in zip(l, clslist): + tpl = tuple(mashingpumpkins.parallel.sketchlist_constructor()) + assert len(tpl) == len(clslist) + for elt, cls in zip(tpl, clslist): assert type(elt) is cls # automagic length adjustment for clslist @@ -113,9 +140,9 @@ def 
test_sketchlist_initializer(): mashingpumpkins.parallel.SketchList.initializer(clslist[:1], argslist) hasattr(mashingpumpkins.parallel, 'sketchlist_constructor') - l = tuple(mashingpumpkins.parallel.sketchlist_constructor()) - assert len(l) == 2 - for elt, (nsize, maxsize, hashfun, seed) in zip(l, argslist): + tpl = tuple(mashingpumpkins.parallel.sketchlist_constructor()) + assert len(tpl) == 2 + for elt, (nsize, maxsize, hashfun, seed) in zip(tpl, argslist): assert elt.nsize == nsize assert elt.maxsize == maxsize assert elt._hashfun is hashfun @@ -129,16 +156,19 @@ def test_sketchlist_map_sequence(): hashfun = hasharray seed = DEFAULT_SEED clslist = (minhashsketch.MaxSketch, minhashsketch.MinSketch) - mashingpumpkins.parallel.SketchList.initializer(clslist, [(nsize, maxsize, hashfun, seed)]) + mashingpumpkins.parallel.SketchList.initializer( + clslist, [(nsize, maxsize, hashfun, seed)] + ) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) + sequence = _make_sequence() mhslist = mashingpumpkins.parallel.SketchList.map_sequence(sequence) for mhs in mhslist: assert mhs.nsize == nsize assert mhs.maxsize == maxsize - assert mhs.nvisited == len(sequence)-nsize+1 + assert mhs.nvisited == len(sequence) - nsize + 1 + def test_sketchlist_map_sequences(): @@ -147,17 +177,23 @@ def test_sketchlist_map_sequences(): hashfun = hasharray seed = DEFAULT_SEED cls = minhashsketch.MaxSketch - mashingpumpkins.parallel.Sketch.initializer(cls, nsize, maxsize, hashfun, seed) + mashingpumpkins.parallel.Sketch.initializer( + cls, nsize, maxsize, hashfun, seed + ) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) - sequences = (sequence[beg:end] for beg, end in chunkpos_iter(nsize, len(sequence), 100)) + sequence = _make_sequence() + sequences = ( + sequence[beg:end] for beg, end in + chunkpos_iter(nsize, len(sequence), 100) + ) mhs = mashingpumpkins.parallel.Sketch.map_sequences(sequences) assert 
mhs.nsize == nsize assert mhs.maxsize == maxsize assert mhs.nvisited == len(sequence)-nsize+1 + def test_sketchlist_reduce_sketches(): nsize = 21 @@ -165,24 +201,28 @@ def test_sketchlist_reduce_sketches(): hashfun = hasharray seed = DEFAULT_SEED clslist = (minhashsketch.MaxSketch, minhashsketch.MinSketch) - mashingpumpkins.parallel.SketchList.initializer(clslist, [(nsize, maxsize, hashfun, seed)]) + mashingpumpkins.parallel.SketchList.initializer( + clslist, [(nsize, maxsize, hashfun, seed)] + ) mhslist = tuple(cls(nsize, maxsize, hashfun, seed) for cls in clslist) mhslist_a = tuple(cls(nsize, maxsize, hashfun, seed) for cls in clslist) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) - for mhs, mhs_a in zip(mhslist,mhslist_a): + sequence = _make_sequence() + for mhs, mhs_a in zip(mhslist, mhslist_a): mhs.add(sequence) mhs_a.add(sequence) mhslist_b = tuple(cls(nsize, maxsize, hashfun, seed) for cls in clslist) random.seed(123) - sequence = b''.join(random.choice((b'A',b'T',b'G',b'C')) for x in range(250)) + sequence = _make_sequence() for mhs, mhs_b in zip(mhslist, mhslist_b): mhs.add(sequence) mhs_b.add(sequence) - mhslist_ab = mashingpumpkins.parallel.SketchList.reduce(mhslist_a, mhslist_b) + mhslist_ab = ( + mashingpumpkins.parallel.SketchList.reduce(mhslist_a, mhslist_b) + ) for mhs, mhs_ab in zip(mhslist, mhslist_ab): assert mhs.nsize == mhs_ab.nsize assert mhs.maxsize == mhs_ab.maxsize