From 9f159df451bdec96c2a2a99349999d72b4733f17 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 17 Nov 2023 10:27:56 -0500 Subject: [PATCH] docs: update README with GA4GH hash --- README.md | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 2a264a0..ba307cf 100644 --- a/README.md +++ b/README.md @@ -23,17 +23,17 @@ fasta-checksum-utils ./my-fasta.fa[.gz] This will print output in the following tab-delimited format: ``` -file [file size in bytes] MD5 [file MD5 hash] TRUNC512 [file TRUNC512 hash] -chr1 [chr1 sequence length] MD5 [chr1 sequence MD5 hash] TRUNC512 [chr1 sequence TRUNC512 hash] -chr2 [chr2 sequence length] MD5 [chr2 sequence MD5 hash] TRUNC512 [chr2 sequence TRUNC512 hash] +file [file size in bytes] md5 [file MD5 hash] ga4gh [file GA4GH hash] +chr1 [chr1 sequence length] md5 [chr1 sequence MD5 hash] ga4gh [chr1 sequence GA4GH hash] +chr2 [chr2 sequence length] md5 [chr2 sequence MD5 hash] ga4gh [chr2 sequence GA4GH hash] ... 
``` The following example is the output generated by specifying the SARS-CoV-2 genome FASTA from NCBI: ``` -file 30429 MD5 863ee5dba1da0ca3f87783782284d489 TRUNC512 3036e94352072c8cd4b5d2e855a72af3d4ca010f6fac1353 -NC_045512.2 29903 MD5 105c82802b67521950854a851fc6eefd TRUNC512 4b2195260fd845e771bec8e9a8d754832caac7b9547eefc3 +file 30428 md5 825ab3c54b7a67ff2db55262eb532438 ga4gh SQ.mMg8qNej7pU84juQQWobw9JyUy09oYdd +NC_045512.2 29903 md5 105c82802b67521950854a851fc6eefd ga4gh SQ.SyGVJg_YRedxvsjpqNdUgyyqx7lUfu_D ``` If the `--out-format bento-json` arguments are passed, the tool will instead output the report in a JSON @@ -43,20 +43,24 @@ is the output generated by specifying the SARS-CoV-2 genome: ```json { - "md5": "863ee5dba1da0ca3f87783782284d489", - "trunc512": "3036e94352072c8cd4b5d2e855a72af3d4ca010f6fac1353", - "fasta_size": 30429, + "fasta": "sars_cov_2.fa", + "fasta_size": 30428, + "md5": "825ab3c54b7a67ff2db55262eb532438", + "ga4gh": "SQ.mMg8qNej7pU84juQQWobw9JyUy09oYdd", "contigs": [ { "name": "NC_045512.2", "md5": "105c82802b67521950854a851fc6eefd", - "trunc512": "4b2195260fd845e771bec8e9a8d754832caac7b9547eefc3", + "ga4gh": "SQ.SyGVJg_YRedxvsjpqNdUgyyqx7lUfu_D", "length": 29903 } ] } ``` +If an argument like `--fai [path or URL]` is passed, an additional `"fai": "..."` property will be added to the JSON +object output. + If an argument like `--genome-id GRCh38` is provided, an additional `"id": "GRCh38"` property will be added to the JSON object output. @@ -80,12 +84,12 @@ async def demo(): print(file_checksum) # prints "863ee5dba1da0ca3f87783782284d489" - all_algorithms = (fc.algorithms.AlgorithmMD5, fc.algorithms.AlgorithmTRUNC512) + all_algorithms = (fc.algorithms.AlgorithmMD5, fc.algorithms.AlgorithmGA4GH) # calculate multiple checksums for a whole file all_checksums: tuple[str, ...] 
= await fc.checksum_file(file=covid_genome, algorithms=all_algorithms) print(all_checksums) - # prints tuple: ("863ee5dba1da0ca3f87783782284d489", "3036e94352072c8cd4b5d2e855a72af3d4ca010f6fac1353") + # prints tuple: ("825ab3c54b7a67ff2db55262eb532438", "SQ.mMg8qNej7pU84juQQWobw9JyUy09oYdd") # calculate an MD5 and TRUNC512 checksum for a specific contig in a PySAM FASTA file: fh = pysam.FastaFile(str(covid_genome)) @@ -96,7 +100,7 @@ async def demo(): algorithms=all_algorithms, ) print(contig_checksums) - # prints tuple: ("105c82802b67521950854a851fc6eefd", "4b2195260fd845e771bec8e9a8d754832caac7b9547eefc3") + # prints tuple: ("105c82802b67521950854a851fc6eefd", "SQ.SyGVJg_YRedxvsjpqNdUgyyqx7lUfu_D") finally: fh.close() # always close the file handle