Skip to content

Commit

Permalink
feat!: upgrade TRUNC512 to GA4GH algo
Browse files (browse the repository at this point in the history)
  • Loading branch information
davidlougheed committed Nov 16, 2023
1 parent aa2aae4 commit 6ae2be7
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 18 deletions.
25 changes: 13 additions & 12 deletions fasta_checksum_utils/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import aiofiles
import binascii
import base64
import hashlib
from abc import abstractmethod
from pathlib import Path
Expand All @@ -8,11 +8,11 @@
__all__ = [
"ChecksumAlgorithm",
"AlgorithmMD5",
"AlgorithmTRUNC512",
"AlgorithmGA4GH",
]

DEFAULT_CHUNK_SIZE = 128 * 1024 # 128 KB
DEFAULT_TRUNC512_OFFSET = 24
DEFAULT_GA4GH_OFFSET = 24


class ChecksumAlgorithm(type):
Expand Down Expand Up @@ -56,7 +56,7 @@ async def checksum_sequence(mcs, sequence: Generator[bytes, None, None]) -> str:
pass


class AlgorithmMD5(metaclass=ChecksumAlgorithm, algorithm_name="MD5"):
class AlgorithmMD5(metaclass=ChecksumAlgorithm, algorithm_name="md5"):

@classmethod
async def checksum_file(cls, file: Path, chunk_size: int = DEFAULT_CHUNK_SIZE, **_kwargs) -> str:
Expand All @@ -67,22 +67,23 @@ async def checksum_sequence(cls, sequence: Generator[bytes, None, None], **_kwar
return ChecksumAlgorithm.update_hash_from_sequence(hashlib.md5(), sequence).hexdigest()


class AlgorithmTRUNC512(metaclass=ChecksumAlgorithm, algorithm_name="TRUNC512"):
class AlgorithmGA4GH(metaclass=ChecksumAlgorithm, algorithm_name="ga4gh"):

@staticmethod
def _trunc512_of_hash(h, offset: int) -> str:
return binascii.hexlify(h.digest()[:offset]).decode("ascii")
def _ga4gh_of_hash(h, offset: int) -> str:
b64_enc = base64.urlsafe_b64encode(h.digest()[:offset]).decode("ascii")
return f"SQ.{b64_enc}"

@classmethod
async def checksum_file(cls, file: Path, chunk_size: int = DEFAULT_CHUNK_SIZE, **kwargs) -> str:
return cls._trunc512_of_hash(
async def checksum_file(cls, file: Path, chunk_size: int = DEFAULT_CHUNK_SIZE, **kwargs):
return cls._ga4gh_of_hash(
h=await cls.update_hash_from_file(hashlib.sha512(), file, chunk_size),
offset=kwargs.pop("offset", DEFAULT_TRUNC512_OFFSET),
offset=kwargs.pop("offset", DEFAULT_GA4GH_OFFSET),
)

@classmethod
async def checksum_sequence(cls, sequence: Generator[bytes, None, None], **kwargs) -> str:
return cls._trunc512_of_hash(
return cls._ga4gh_of_hash(
h=cls.update_hash_from_sequence(hashlib.sha512(), sequence),
offset=kwargs.pop("offset", DEFAULT_TRUNC512_OFFSET),
offset=kwargs.pop("offset", DEFAULT_GA4GH_OFFSET),
)
12 changes: 6 additions & 6 deletions tests/test_checksums.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@
import pysam
import pytest
from fasta_checksum_utils import checksum_file, checksum_contig
from fasta_checksum_utils.algorithms import AlgorithmMD5, AlgorithmTRUNC512
from fasta_checksum_utils.algorithms import AlgorithmMD5, AlgorithmGA4GH
from fasta_checksum_utils.fasta import fasta_report


EXAMPLE_FASTA = pathlib.Path(__file__).parent / "data" / "example.fa"
TESTED_ALGORITHMS = (AlgorithmMD5, AlgorithmTRUNC512)
TESTED_ALGORITHMS = (AlgorithmMD5, AlgorithmGA4GH)


@pytest.mark.asyncio
async def test_file_checksums():
assert (await checksum_file(EXAMPLE_FASTA, TESTED_ALGORITHMS)) == (
"3cc31e8136477d1c7d7e2b7c050c06bd",
"e9b98ddd7ba5cc3622199a535ec32448542012bb6e143df1",
"SQ.6bmN3XulzDYiGZpTXsMkSFQgErtuFD3x",
)


Expand All @@ -25,7 +25,7 @@ async def test_contig_checksums():
try:
assert (await checksum_contig(fh, "chr1", TESTED_ALGORITHMS)) == (
"bd6a33a85050db787b28c0c8230aaa80",
"a13b5e54899ec0ad3f67a71403673b4146f961a02af0783d",
"SQ.oTteVImewK0_Z6cUA2c7QUb5YaAq8Hg9",
)
finally:
fh.close()
Expand All @@ -47,11 +47,11 @@ async def test_fasta_report():

# file checksums
assert json_data["md5"] == "3cc31e8136477d1c7d7e2b7c050c06bd"
assert json_data["trunc512"] == "e9b98ddd7ba5cc3622199a535ec32448542012bb6e143df1"
assert json_data["ga4gh"] == "SQ.6bmN3XulzDYiGZpTXsMkSFQgErtuFD3x"

# contigs
assert len(json_data["contigs"]) == 2
assert json_data["contigs"][0]["md5"] == "bd6a33a85050db787b28c0c8230aaa80"
assert json_data["contigs"][0]["trunc512"] == "a13b5e54899ec0ad3f67a71403673b4146f961a02af0783d"
assert json_data["contigs"][0]["ga4gh"] == "SQ.oTteVImewK0_Z6cUA2c7QUb5YaAq8Hg9"
assert json_data["contigs"][0]["length"] == 33
assert json_data["contigs"][1]["length"] == 28

0 comments on commit 6ae2be7

Please sign in to comment.