Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix param call #164

Merged
merged 37 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
e96e3f4
fix param call
pdimens Nov 15, 2024
8dcd783
compress
pdimens Nov 15, 2024
1693ca2
clean up the logic a bit
pdimens Nov 15, 2024
1fa481b
add uncompressed
pdimens Nov 15, 2024
b65ae6d
better, easier logic
pdimens Nov 15, 2024
4631f81
swap back to conda
pdimens Nov 15, 2024
e5665d9
improvements and fixes
pdimens Nov 15, 2024
79a440d
rm bc it can be generated on the fly
pdimens Nov 15, 2024
a8404d4
typo
pdimens Nov 15, 2024
bd6163d
another typo
pdimens Nov 15, 2024
06e03c3
clean up
pdimens Nov 15, 2024
d6d10de
final push to make it work
pdimens Nov 15, 2024
d7b4c78
much nicer naming
pdimens Nov 15, 2024
56dbe49
add -a for dwgsim, -g for fastafai, comment out error rate
pdimens Nov 18, 2024
21a9da1
rm todo note
pdimens Nov 18, 2024
fc920e6
take 2-col barcode file
pdimens Nov 18, 2024
2f94d82
rm -r
pdimens Nov 18, 2024
1ce1aa9
update lrsim call
pdimens Nov 18, 2024
12be7a0
fix logic
pdimens Nov 18, 2024
bbbf803
rm len reference
pdimens Nov 18, 2024
0c298f2
rename to HaploSim.pl
pdimens Nov 18, 2024
bd61e72
fix the math
pdimens Nov 18, 2024
a17c527
fixes
pdimens Nov 18, 2024
2e24b90
rename outputs, make .status not hardcoded
pdimens Nov 18, 2024
3ee5d18
clean up
pdimens Nov 18, 2024
618f8a1
refactor the file processing
pdimens Nov 18, 2024
00cc740
refactor workdirs/names
pdimens Nov 18, 2024
045e11f
lower barcode count
pdimens Nov 18, 2024
2847727
add duplication checking
pdimens Nov 18, 2024
0d5f8fb
rm from progress bar b/c it'll be misleading
pdimens Nov 18, 2024
fe1e773
rename rule
pdimens Nov 18, 2024
4875bd4
gzip the barcodes
pdimens Nov 18, 2024
cab9dbd
revert to dict?
pdimens Nov 18, 2024
74d1498
switch to sqllite3 db for memory efficiency
pdimens Nov 19, 2024
c8373cd
rm status file altogether
pdimens Nov 19, 2024
3e4c8b7
rm status file
pdimens Nov 19, 2024
d20ee09
fixes
pdimens Nov 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ simreads: &simreads
- 'extractReads.cpp'
- 'harpy/bin/inline_to_haplotag.py'
- 'harpy/bin/haplotag_barcodes.py'
- 'harpy/scripts/LRSIM_harpy.pl'
- 'harpy/scripts/HaploSim.pl'
assembly: &assembly
- *common
- *container
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ jobs:
- name: simulate linked reads
shell: micromamba-shell {0}
run: |
gunzip test/haplotag.bc.gz
haplotag_barcodes.py -n 14000000 > test/haplotag.bc
harpy simulate linkedreads --quiet -t 4 -n 2 -b test/haplotag.bc -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz

assembly:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ harpy.egg-info/
__pycache__/
.Benchmark/
extractReads
haplotag.bc
_Inline
205 changes: 90 additions & 115 deletions harpy/_validations.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
if contigs:
fasta_contig_match(contigs, genome)
if barcode_list:
validate_barcodefile(barcode_list)
validate_barcodefile(barcode_list, False, quiet)
fetch_rule(workflowdir, "align_ema.smk")
fetch_report(workflowdir, "align_stats.Rmd")
fetch_report(workflowdir, "align_bxstats.Rmd")
Expand Down
19 changes: 13 additions & 6 deletions harpy/bin/haplotag_barcodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,26 @@

parser = argparse.ArgumentParser(
prog = 'haplotag_barcodes.py',
description ="Generates a text file listing the haplotagging ACBD barcodes",
usage = "haplotag_barcodes.py > barcodes.txt"
description ="Prints haplotag linked-read barcodes to stdout",
usage = "haplotag_barcodes.py [-n] > barcodes.txt"
)

parser.add_argument("-n", "--number", default=96**4, type = int, help = "How many barcodes to print (min: 1, max: 84934656)")
args = parser.parse_args()

if args.number < 1 or args.number > 96**4:
parser.error("--number must be between 1 and 96^4 (84934656)")
# subtract 1 b/c of python 0-indexing
args.number -= 1

BX = {
"A": ["ACGGAA", "CCAACA", "AGATCG", "TTCTCC", "TTCCTG", "TTCGGT", "TTGTGG", "TTGCCT", "TTGGTC", "TTACGC", "TTAGCG", "TCTTCG", "TCTCTC", "TCTGGA", "TCCACT", "TCGTAC", "TCGATG", "TCACAG", "TGTTGC", "TGTCCA", "TGTGTG", "TGCTAG", "TGCATC", "TGGAGT", "TGAGAC", "TATCGG", "TATGCC", "TACCAC", "TAGGAG", "CTTCGT", "CTTGCA", "CTCTGA", "CTCAAC", "CTGCTA", "CTGGAT", "CTAAGG", "CCTCAA", "CCTGTT", "CCATTC", "CGTTCT", "CGTAGA", "CGGTAA", "CGACTT", "CATACG", "CACTTG", "CACGAA", "CACAGT", "CAGATC", "CAACGA", "CAAGCT", "GTTCAC", "GTCGTA", "GTGTCA", "GTGAAG", "GTAACC", "GCTTGT", "GCCTAA", "GCACTA", "GCAGAT", "GGTGAA", "GGCAAT", "GGATGA", "GGAATG", "GATCCT", "GATAGC", "GACACA", "GAGCAA", "GAGGTT", "ATTCCG", "ATTGGC", "ATCGAG", "ACTACC", "ACCAGA", "ACGTCT", "ACACGT", "ACAGTG", "AGCTGT", "AGCCTA", "AGGTTC", "AGGCAT", "AGGACA", "AGAAGC", "AACGTC", "AAGCTG", "CGAGTA", "GAATCC", "GAATGG", "AAGTGC", "AAGAGG", "TACAGG", "CTGACT", "CTAGTC", "CCTAAG", "CCATAG", "CGTAAC", "CAATGC"],
"C": ["GAAACG", "ACACCA", "TCGAGA", "TCCTTC", "CTGTTC", "GGTTTC", "TGGTTG", "CCTTTG", "GTCTTG", "CGCTTA", "GCGTTA", "TCGTCT", "CTCTCT", "GGATCT", "ACTTCC", "TACTCG", "ATGTCG", "CAGTCA", "TGCTGT", "CCATGT", "GTGTGT", "TAGTGC", "ATCTGC", "AGTTGG", "GACTGA", "CGGTAT", "GCCTAT", "CACTAC", "GAGTAG", "CGTCTT", "GCACTT", "TGACTC", "AACCTC", "CTACTG", "GATCTG", "AGGCTA", "CAACCT", "GTTCCT", "TTCCCA", "TCTCGT", "AGACGT", "TAACGG", "CTTCGA", "ACGCAT", "TTGCAC", "GAACAC", "AGTCAC", "ATCCAG", "CGACAA", "GCTCAA", "CACGTT", "GTAGTC", "TCAGTG", "AAGGTG", "ACCGTA", "TGTGCT", "TAAGCC", "CTAGCA", "GATGCA", "GAAGGT", "AATGGC", "TGAGGA", "ATGGGA", "CCTGAT", "AGCGAT", "ACAGAC", "CAAGAG", "GTTGAG", "CCGATT", "GGCATT", "GAGATC", "ACCACT", "AGAACC", "TCTACG", "CGTACA", "GTGACA", "TGTAGC", "CTAAGC", "TTCAGG", "CATAGG", "ACAAGG", "AGCAGA", "GTCAAC", "CTGAAG", "GTACGA", "TCCGAA", "TGGGAA", "TGCAAG", "AGGAAG", "AGGTAC", "ACTCTG", "GTCCTA", "AAGCCT", "TAGCCA", "AACCGT", "TGCCAA"],
"B": ["AACGGA", "ACCAAC", "GAGATC", "CTTCTC", "GTTCCT", "TTTCGG", "GTTGTG", "TTTGCC", "CTTGGT", "CTTACG", "GTTAGC", "GTCTTC", "CTCTCT", "ATCTGG", "TTCCAC", "CTCGTA", "GTCGAT", "GTCACA", "CTGTTG", "ATGTCC", "GTGTGT", "GTGCTA", "CTGCAT", "TTGGAG", "CTGAGA", "GTATCG", "CTATGC", "CTACCA", "GTAGGA", "TCTTCG", "ACTTGC", "ACTCTG", "CCTCAA", "ACTGCT", "TCTGGA", "GCTAAG", "ACCTCA", "TCCTGT", "CCCATT", "TCGTTC", "ACGTAG", "ACGGTA", "TCGACT", "GCATAC", "GCACTT", "ACACGA", "TCACAG", "CCAGAT", "ACAACG", "TCAAGC", "CGTTCA", "AGTCGT", "AGTGTC", "GGTGAA", "CGTAAC", "TGCTTG", "AGCCTA", "AGCACT", "TGCAGA", "AGGTGA", "TGGCAA", "AGGATG", "GGGAAT", "TGATCC", "CGATAG", "AGACAC", "AGAGCA", "TGAGGT", "GATTCC", "CATTGG", "GATCGA", "CACTAC", "AACCAG", "TACGTC", "TACACG", "GACAGT", "TAGCTG", "AAGCCT", "CAGGTT", "TAGGCA", "AAGGAC", "CAGAAG", "CAACGT", "GAAGCT", "ACGAGT", "CGAATC", "GGAATG", "CAAGTG", "GAAGAG", "GTACAG", "TCTGAC", "CCTAGT", "GCCTAA", "GCCATA", "CCGTAA", "CCAATG"],
"D": ["GGAAAC", "AACACC", "ATCGAG", "CTCCTT", "CCTGTT", "CGGTTT", "GTGGTT", "GCCTTT", "GGTCTT", "ACGCTT", "AGCGTT", "TTCGTC", "TCTCTC", "TGGATC", "CACTTC", "GTACTC", "GATGTC", "ACAGTC", "TTGCTG", "TCCATG", "TGTGTG", "CTAGTG", "CATCTG", "GAGTTG", "AGACTG", "TCGGTA", "TGCCTA", "CCACTA", "GGAGTA", "TCGTCT", "TGCACT", "CTGACT", "CAACCT", "GCTACT", "GGATCT", "AAGGCT", "TCAACC", "TGTTCC", "ATTCCC", "TTCTCG", "TAGACG", "GTAACG", "ACTTCG", "TACGCA", "CTTGCA", "CGAACA", "CAGTCA", "GATCCA", "ACGACA", "AGCTCA", "TCACGT", "CGTAGT", "GTCAGT", "GAAGGT", "AACCGT", "TTGTGC", "CTAAGC", "ACTAGC", "AGATGC", "TGAAGG", "CAATGG", "ATGAGG", "AATGGG", "TCCTGA", "TAGCGA", "CACAGA", "GCAAGA", "GGTTGA", "TCCGAT", "TGGCAT", "CGAGAT", "TACCAC", "CAGAAC", "GTCTAC", "ACGTAC", "AGTGAC", "CTGTAG", "CCTAAG", "GTTCAG", "GCATAG", "GACAAG", "AAGCAG", "CGTCAA", "GCTGAA", "AGTACG", "ATCCGA", "ATGGGA", "GTGCAA", "GAGGAA", "CAGGTA", "GACTCT", "AGTCCT", "TAAGCC", "ATAGCC", "TAACCG", "ATGCCA"]
}

bc_generator = product(BX["A"], BX["C"], BX["B"], BX["D"])
for BC in bc_generator:
sys.stdout.write("".join(BC) + "\n")
for i,BC in enumerate(bc_generator):
sys.stdout.write("".join(BC) + "\n")
if i >= args.number:
pdimens marked this conversation as resolved.
Show resolved Hide resolved
break
137 changes: 65 additions & 72 deletions harpy/bin/inline_to_haplotag.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,16 @@
import sys
import gzip
import argparse
from itertools import zip_longest, product
from itertools import zip_longest

parser = argparse.ArgumentParser(
prog = 'inline_to_haplotag.py',
description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z).',
usage = "inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix>",
description = 'Moves inline linked read barcodes to read headers (OX:Z) and converts them into haplotag ACBD format (BX:Z). Barcodes must all be the same length.',
usage = f"inline_to_haplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix>",
exit_on_error = False
)

parser.add_argument("-f", "--forward", required = True, type = str, help = "Forward reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-r", "--reverse", required = True, type = str, help = "Reverse reads of paired-end FASTQ file pair (gzipped)")
parser.add_argument("-l", "--length", required = True, type = str, help = "Length of the barcodes (all must be one length)")
parser.add_argument("-p", "--prefix", required = True, type = str, help = "Prefix for outfile files (e.g. <prefix>.R1.fq.gz)")
parser.add_argument("-b", "--barcodes", required = True, type=str, help="File listing the linked-read barcodes to convert to haplotag format, one barcode per line")
if len(sys.argv) == 1:
Expand All @@ -29,90 +27,85 @@
if err:
parser.error("Some input files were not found on the system:\n" + ", ".join(err))

def iter_fastq_records(file_handle):
"""Iterate over FASTQ records in a file.
file_handle: Opened gzip file handle
Yields: FASTQ record [header, seq, '+', qual]
Raises ValueError If file is not in FASTQ format
"""
record = []
for line in file_handle:
line = line.decode().rstrip("\n")
record.append(line)
if len(record) == 4:
# format sanity check
if not (record[0].startswith("@") and record[2] == "+"):
raise ValueError("Invalid FASTQ format")
yield record
record = []
if record:
raise ValueError("Incomplete FASTQ record at end of file")

def validate_barcode(barcode):
"""Validate barcode format (A,C,G,T)."""
if not set(barcode).issubset({'A','C','G','T'}):
raise ValueError(f"Invalid barcode format: {barcode}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.")
def valid_record(fq_rec, FR):
"""fastq format sanity check"""
if not (fq_rec[0].startswith("@") and fq_rec[2] == "+"):
raise ValueError(f"Invalid FASTQ format for {FR} reads")

def process_record(fw_entry, rv_entry, barcode_dict, haplotag_bc, bc_len):
def process_record(fw_entry, rv_entry, barcode_dict, bc_len):
"""convert the barcode to haplotag"""
# [0] = header, [1] = seq, [2] = +, [3] = qual
# search for a valid barcode at all possible barcode lengths
if fw_entry:
valid_record(fw_entry, "forward")
bc_inline = fw_entry[1][:bc_len]
bc_hap = barcode_dict.get(bc_inline, "A00C00B00D00")
# the default barcode entry is None, meaning it hasnt been assigned a haplotag equivalent yet
if not bc_hap:
bc_hap = "".join(next(haplotag_bc))
barcode_dict[bc_inline] = bc_hap
_new_fw = fw_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
_new_fw += fw_entry[1][bc_len:] + "\n"
_new_fw += fw_entry[2] + "\n"
_new_fw += fw_entry[3][bc_len:] + "\n"
fw_entry[0] = fw_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}"
fw_entry[1] = fw_entry[1][bc_len:]
fw_entry[3] = fw_entry[3][bc_len:]
_new_fw = "\n".join(fw_entry) + "\n"
if rv_entry:
_new_rv = rv_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}\n"
_new_rv += rv_entry[1] + "\n"
_new_rv += rv_entry[2] + "\n"
_new_rv += rv_entry[3] + "\n"
valid_record(rv_entry, "reverse")
rv_entry[0] = rv_entry[0].split()[0] + f"\tOX:Z:{bc_inline}\tBX:Z:{bc_hap}"
_new_rv = "\n".join(rv_entry) + "\n"
else:
_new_rv = None
return _new_fw, _new_rv
else:
_new_fw = None
# no forward read, therefor no barcode to search for
_new_rv = rv_entry[0].split()[0] + f"\tBX:Z:A00C00B00D00\n"
_new_rv += rv_entry[1] + "\n"
_new_rv += rv_entry[2] + "\n"
_new_rv += rv_entry[3] + "\n"
return None, _new_rv

bc_range = [f"{i}".zfill(2) for i in range(1,97)]
bc_generator = product("A", bc_range, "C", bc_range, "B", bc_range, "D", bc_range)
if rv_entry:
valid_record(rv_entry, "reverse")
rv_entry[0] = rv_entry[0].split()[0] + "\tBX:Z:A00C00B00D00"
_new_rv = "\n".join(rv_entry) + "\n"
else:
_new_rv = None
return _new_fw, _new_rv

nucleotides = {'A','C','G','T'}
lengths = set()
bc_dict = {}

# read in barcodes
opener = gzip.open if args.barcodes.lower().endswith('.gz') else open
mode = 'rt' if args.barcodes.lower().endswith('.gz') else 'r'
with opener(args.barcodes, mode) as bc_file:
for line in bc_file:
barcode = line.rstrip().split()[0]
validate_barcode(barcode)
bc_dict[barcode] = None
try:
ATCG,ACBD = line.rstrip().split()
except ValueError:
sys.stderr.write(f"Invalid barcode entry: {line.rstrip()}\nExpected two entries: a nucleotide barcode and ACBD format barcode with a space/tab separatng them, e.g. ATATCAGA A01C22B13D93")
sys.exit(1)
if not set(ATCG).issubset(nucleotides):
sys.stderr.write(f"Invalid barcode format: {ATCG}. Barcodes must be captial letters and only contain standard nucleotide values ATCG.\n")
sys.exit(1)
bc_dict[ATCG] = ACBD
lengths.add(len(ATCG))
if len(lengths) > 1:
sys.stderr.write("Can only search sequences for barcodes of a single length, but multiple barcode legnths detected: " + ",".join([str(i) for i in lengths]))
else:
bc_len = lengths.pop()

# simultaneously iterate the forward and reverse fastq files
fw_out = gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6)
rv_out = gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6)

with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i:
for fw_record, rv_record in zip_longest(iter_fastq_records(fw_i), iter_fastq_records(rv_i)):
new_fw, new_rv = process_record(fw_record, rv_record, bc_dict, bc_generator, args.length)
if new_fw:
fw_out.write(new_fw.encode("utf-8"))
if new_rv:
rv_out.write(new_rv.encode("utf-8"))

fw_out.close()
rv_out.close()

with open(f"{args.prefix}.barcodes", "w") as bx_out:
for i,j in bc_dict.items():
if j:
bx_out.write(f"{i}\t{j}\n")
with gzip.open(args.forward, "r") as fw_i, gzip.open(args.reverse, "r") as rv_i,\
gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) as fw_out,\
gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) as rv_out:
record_F = []
record_R = []
for fw_record, rv_record in zip_longest(fw_i, rv_i):
try:
record_F.append(fw_record.decode().rstrip("\n"))
except AttributeError:
# if the file ends before the other one
pass
try:
record_R.append(rv_record.decode().rstrip("\n"))
except AttributeError:
pass
# sanity checks
if len(record_F) == 4 or len(record_R) == 4:
new_fw, new_rv = process_record(record_F, record_R, bc_dict, bc_len)
if new_fw:
fw_out.write(new_fw.encode("utf-8"))
record_F = []
if new_rv:
rv_out.write(new_rv.encode("utf-8"))
record_R = []
Loading
Loading