Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix param call #164

Merged
merged 37 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
e96e3f4
fix param call
pdimens Nov 15, 2024
8dcd783
compress
pdimens Nov 15, 2024
1693ca2
clean up the logic a bit
pdimens Nov 15, 2024
1fa481b
add uncompressed
pdimens Nov 15, 2024
b65ae6d
better, easier logic
pdimens Nov 15, 2024
4631f81
swap back to conda
pdimens Nov 15, 2024
e5665d9
improvements and fixes
pdimens Nov 15, 2024
79a440d
rm bc it can be generated on the fly
pdimens Nov 15, 2024
a8404d4
typo
pdimens Nov 15, 2024
bd6163d
another typo
pdimens Nov 15, 2024
06e03c3
clean up
pdimens Nov 15, 2024
d6d10de
final push to make it work
pdimens Nov 15, 2024
d7b4c78
much nicer naming
pdimens Nov 15, 2024
56dbe49
add -a for dwgsim, -g for fastafai, comment out error rate
pdimens Nov 18, 2024
21a9da1
rm todo note
pdimens Nov 18, 2024
fc920e6
take 2-col barcode file
pdimens Nov 18, 2024
2f94d82
rm -r
pdimens Nov 18, 2024
1ce1aa9
update lrsim call
pdimens Nov 18, 2024
12be7a0
fix logic
pdimens Nov 18, 2024
bbbf803
rm len reference
pdimens Nov 18, 2024
0c298f2
rename to HaploSim.pl
pdimens Nov 18, 2024
bd61e72
fix the math
pdimens Nov 18, 2024
a17c527
fixes
pdimens Nov 18, 2024
2e24b90
rename outputs, make .status not hardcoded
pdimens Nov 18, 2024
3ee5d18
clean up
pdimens Nov 18, 2024
618f8a1
refactor the file processing
pdimens Nov 18, 2024
00cc740
refactor workdirs/names
pdimens Nov 18, 2024
045e11f
lower barcode count
pdimens Nov 18, 2024
2847727
add duplication checking
pdimens Nov 18, 2024
0d5f8fb
rm from progress bar b/c it'll be misleading
pdimens Nov 18, 2024
fe1e773
rename rule
pdimens Nov 18, 2024
4875bd4
gzip the barcodes
pdimens Nov 18, 2024
cab9dbd
revert to dict?
pdimens Nov 18, 2024
74d1498
switch to sqllite3 db for memory efficiency
pdimens Nov 19, 2024
c8373cd
rm status file altogether
pdimens Nov 19, 2024
3e4c8b7
rm status file
pdimens Nov 19, 2024
d20ee09
fixes
pdimens Nov 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ jobs:
- name: simulate linked reads
shell: micromamba-shell {0}
run: |
gunzip test/haplotag.bc.gz
haplotag_barcodes.py -n 14500000 > test/haplotag.bc
harpy simulate linkedreads --quiet -t 4 -n 2 -b test/haplotag.bc -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz

assembly:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ harpy.egg-info/
__pycache__/
.Benchmark/
extractReads
haplotag.bc
_Inline
190 changes: 76 additions & 114 deletions harpy/_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,17 +169,16 @@ def check_RG(bamfile):
if samplename != samview[0]:
return os.path.basename(bamfile), samview[0]

with harpy_progressbar(quiet) as progress:
task_progress = progress.add_task("[green]Checking RG tags...", total=len(bamlist))
with ThreadPoolExecutor(max_workers=threads) as executor:
futures = [executor.submit(check_RG, i) for i in bamlist]
for future in as_completed(futures):
result = future.result()
progress.update(task_progress, advance=1)
if result:
culpritfiles.append(result[0])
culpritIDs.append(result[1])

with harpy_progressbar(quiet) as progress, ThreadPoolExecutor(max_workers=threads) as executor:
task_progress = progress.add_task("[green]Checking RG tags", total=len(bamlist))
futures = [executor.submit(check_RG, i) for i in bamlist]
for future in as_completed(futures):
result = future.result()
progress.update(task_progress, advance=1)
if result:
culpritfiles.append(result[0])
culpritIDs.append(result[1])

if len(culpritfiles) > 0:
print_error("sample ID mismatch",f"There are [bold]{len(culpritfiles)}[/bold] alignment files whose RG tags do not match their filenames.")
print_solution_with_culprits(
Expand Down Expand Up @@ -289,33 +288,6 @@ def validate_demuxschema(infile):
_ = [click.echo(f"{i[0]+1}\t{i[1]}", file = sys.stderr) for i in invalids]
sys.exit(1)

def check_demux_fastq(file):
"""Check for the presence of corresponding FASTQ files from a single provided FASTQ file based on pipeline expectations."""
bn = os.path.basename(file)
if not bn.lower().endswith("fq") and not bn.lower().endswith("fastq") and not bn.lower().endswith("fastq.gz") and not bn.lower().endswith("fq.gz"):
print_error("unrecognized extension", f"The file [blue]{bn}[/blue] is not recognized as a FASTQ file by the file extension.")
print_solution("Make sure the input file ends with a standard FASTQ extension. These are not case-sensitive.\nAccepted extensions: [green bold].fq .fastq .fq.gz .fastq.gz[/green bold]")
sys.exit(1)
ext = re.search(r"(?:\_00[0-9])*\.f(.*?)q(?:\.gz)?$", file, re.IGNORECASE).group(0)
prefix = re.sub(r"[\_\.][IR][12]?(?:\_00[0-9])*\.f(?:ast)?q(?:\.gz)?$", "", bn)
prefixfull = re.sub(r"[\_\.][IR][12]?(?:\_00[0-9])*\.f(?:ast)?q(?:\.gz)?$", "", file)
filelist = []
printerr = False
for i in ["I1", "I2","R1","R2"]:
chkfile = f"{prefixfull}_{i}{ext}"
TF = os.path.exists(chkfile)
printerr = True if not TF else printerr
symbol = " " if TF else "X"
filelist.append(f"\033[91m{symbol}\033[0m {prefix}_{i}{ext}")
if printerr:
print_error("missing files", f"Not all necessary files with prefix [bold]{prefix}[/bold] present")
print_solution_with_culprits(
"Demultiplexing requires 4 sequence files: the forward and reverse sequences (R1 and R2), along with the forward and reverse indices (I2 and I2).",
"Necessary/expected files:"
)
_ = [click.echo(i, file = sys.stderr) for i in filelist]
sys.exit(1)

def validate_regions(regioninput, genome):
"""validates the --regions input of harpy snp to infer whether it's an integer, region, or file"""
try:
Expand Down Expand Up @@ -350,22 +322,15 @@ def validate_regions(regioninput, genome):
# check if the region is in the genome

contigs = {}
if is_gzip(genome):
with gzip.open(genome, "rt") as fopen:
for line in fopen:
if line.startswith(">"):
cn = line.rstrip("\n").lstrip(">").split()[0]
contigs[cn] = 0
else:
contigs[cn] += len(line.rstrip("\n")) - 1
else:
with open(genome, "r", encoding="utf-8") as fout:
for line in fopen:
if line.startswith(">"):
cn = line.rstrip("\n").lstrip(">").split()[0]
contigs[cn] = 0
else:
contigs[cn] += len(line.rstrip("\n")) - 1
opener = gzip.open if is_gzip(genome) else open
mode = "rt" if is_gzip(genome) else "r"
with opener(genome, mode) as fopen:
for line in fopen:
if line.startswith(">"):
cn = line.rstrip("\n").lstrip(">").split()[0]
contigs[cn] = 0
else:
contigs[cn] += len(line.rstrip("\n")) - 1
pdimens marked this conversation as resolved.
Show resolved Hide resolved
err = ""
if reg[0] not in contigs:
print_error("contig not found", f"The contig ([bold yellow]{reg[0]})[/bold yellow]) of the input region [yellow bold]{regioninput}[/yellow bold] was not found in [blue]{genome}[/blue].")
Expand All @@ -386,12 +351,8 @@ def validate_regions(regioninput, genome):
with open(regioninput, "r", encoding="utf-8") as fin:
badrows = []
idx = 0
while True:
line = fin.readline()
if not line:
break
else:
idx += 1
for line in fin:
idx += 1
pdimens marked this conversation as resolved.
Show resolved Hide resolved
row = line.split()
if len(row) != 3:
badrows.append(idx)
Expand All @@ -415,43 +376,38 @@ def validate_regions(regioninput, genome):
def check_fasta(genofile):
"""perform validations on fasta file for extensions and file contents"""
# validate fasta file contents
if is_gzip(genofile):
fasta = gzip.open(genofile, 'rt')
elif is_plaintext(genofile):
fasta = open(genofile, 'r', encoding="utf-8")
else:
print_error("unknown file type", f"Unable to determine file encoding for [blue]{genofile}[/blue]. Please check that it is a gzipped or uncompressed FASTA file.")
sys.exit(1)
opener = gzip.open if is_gzip(genofile) else open
mode = "rt" if is_gzip(genofile) else "r"
line_num = 0
seq_id = 0
seq = 0
last_header = False
for line in fasta:
line_num += 1
if line.startswith(">"):
seq_id += 1
if last_header:
print_error("consecutive contig names", f"All contig names must be followed by at least one line of nucleotide sequences, but two consecutive lines of contig names were detected. This issue was identified at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue], but there may be others further in the file.")
with opener(genofile, mode) as fasta:
for line in fasta:
line_num += 1
if line.startswith(">"):
seq_id += 1
if last_header:
print_error("consecutive contig names", f"All contig names must be followed by at least one line of nucleotide sequences, but two consecutive lines of contig names were detected. This issue was identified at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue], but there may be others further in the file.")
print_solution("See the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/")
sys.exit(1)
else:
last_header = True
if len(line.rstrip()) == 1:
print_error("unnamed contigs", f"All contigs must have an alphanumeric name, but a contig was detected without a name. This issue was identified at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue], but there may be others further in the file.")
print_solution("See the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/")
sys.exit(1)
if line.startswith("> "):
print_error("invalid contig names", f"All contig names must be named [green bold]>contig_name[/green bold], without a space, but a contig was detected with a space between the [green bold]>[/green bold] and contig_name. This issue was identified at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue], but there may be others further in the file.")
print_solution("See the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/")
sys.exit(1)
elif line == "\n":
print_error("empty lines", f"Empty lines are not permitted in FASTA files, but one was detected at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue]. The scan ended at this error, but there may be others further in the file.")
print_solution("See the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/")
sys.exit(1)
else:
last_header = True
if len(line.rstrip()) == 1:
print_error("unnamed contigs", f"All contigs must have an alphanumeric name, but a contig was detected without a name. This issue was identified at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue], but there may be others further in the file.")
print_solution("See the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/")
sys.exit(1)
if line.startswith("> "):
print_error("invalid contig names", f"All contig names must be named [green bold]>contig_name[/green bold], without a space, but a contig was detected with a space between the [green bold]>[/green bold] and contig_name. This issue was identified at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue], but there may be others further in the file.")
print_solution("See the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/")
sys.exit(1)
elif line == "\n":
print_error("empty lines", f"Empty lines are not permitted in FASTA files, but one was detected at line [bold]{line_num}[/bold] in [blue]{genofile}[/blue]. The scan ended at this error, but there may be others further in the file.")
print_solution("See the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/")
sys.exit(1)
else:
seq += 1
last_header = False
fasta.close()
seq += 1
last_header = False
solutiontext = "FASTA files must have at least one contig name followed by sequence data on the next line. Example:\n"
solutiontext += "[green] >contig_name\n ATACAGGAGATTAGGCA[/green]\n"
# make sure there is at least one of each
Expand Down Expand Up @@ -497,10 +453,7 @@ def validate(fastq):
else:
fq = open(fastq, "r")
with fq:
while True:
line = fq.readline()
if not line:
break
for line in fq:
if not line.startswith("@"):
continue
BX = True if "BX:Z" in line else BX
Expand All @@ -516,34 +469,43 @@ def validate(fastq):
sys.exit(1)

# parellelize over the fastq list
with harpy_progressbar(quiet) as progress:
task_progress = progress.add_task("[green]Validating FASTQ inputs...", total=len(fastq_list))
with ThreadPoolExecutor(max_workers=threads) as executor:
futures = [executor.submit(validate, i) for i in fastq_list]
for future in as_completed(futures):
progress.update(task_progress, advance=1)
with harpy_progressbar(quiet) as progress, ThreadPoolExecutor(max_workers=threads) as executor:
task_progress = progress.add_task("[green]Validating FASTQ inputs", total=len(fastq_list))
futures = [executor.submit(validate, i) for i in fastq_list]
for future in as_completed(futures):
progress.update(task_progress, advance=1)

def validate_barcodefile(infile, return_len = False):
"""Does validations to make sure it's one length, one per line, and nucleotides"""
def validate_barcodefile(infile, return_len = False, quiet = False, limit = None):
#TODO DUPLICATE CHECK
"""Does validations to make sure it's one length, within a length limit, one per line, and nucleotides"""
if is_gzip(infile):
print_error("Incorrect format", f"The input file must be in uncompressed format. Please decompress [blue]{infile}[/blue] and try again.")
sys.exit(1)
lengths = set()
nucleotides = {'A','C','G','T'}
line_num = 0
with open(infile, "r") as bc_file:
for line in bc_file:
line_num += 1
barcode = line.rstrip()
if len(barcode.split()) > 1:
print_error("Incorrect format", f"There must be one barcode per line, but multiple entries were detected on [bold]line {line_num}[/bold] in [blue]{infile}[/blue]")
sys.exit(1)
if not set(barcode).issubset(nucleotides) or barcode != barcode.upper():
print_error("Incorrect format", f"Invalid barcode format on [bold]line {line_num}[/bold]: [yellow]{barcode}[/yellow].\nBarcodes in [blue]{infile}[/blue] must be captial letters and only contain standard nucleotide characters [green]ATCG[/green].")
def validate(line_num, bc_line):
barcode = bc_line.rstrip()
if len(barcode.split()) > 1:
print_error("Incorrect format", f"There must be one barcode per line, but multiple entries were detected on [bold]line {line_num}[/bold] in [blue]{infile}[/blue]")
sys.exit(1)
if not set(barcode).issubset(nucleotides) or barcode != barcode.upper():
print_error("Incorrect format", f"Invalid barcode format on [bold]line {line_num }[/bold]: [yellow]{barcode}[/yellow].\nBarcodes in [blue]{infile}[/blue] must be captial letters and only contain standard nucleotide characters [green]ATCG[/green].")
sys.exit(1)
return len(barcode)

with open(infile, "r") as bc_file, harpy_progressbar(quiet) as progress:
out = subprocess.Popen(['wc', '-l', infile], stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0]
linenum = int(out.partition(b' ')[0])
task_progress = progress.add_task("[green]Validating barcodes", total=linenum)
for line,bc in enumerate(bc_file):
length = validate(line + 1, bc)
if limit and length > limit:
print_error("Barcodes too long", f"Barcodes in [blue]{infile}[/blue] are [yellow]{length}bp[/yellow] and cannot exceed a length of [bold]{limit}bp[/bold]. Please use shorter barcodes (it's a limitation of LRSIM).")
sys.exit(1)
lengths.add(len(barcode))
lengths.add(length)
progress.update(task_progress, advance=1)
if len(lengths) > 1:
print_error("Incorrect format", f"Barcodes in [blue]{infile}[/blue] must all be a single length, but multiple lengths were detected: [yellow]" + ", ".join(lengths))
sys.exit(1)
pdimens marked this conversation as resolved.
Show resolved Hide resolved
if return_len:
return lengths.pop()
return lengths.pop()
2 changes: 1 addition & 1 deletion harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
if contigs:
fasta_contig_match(contigs, genome)
if barcode_list:
validate_barcodefile(barcode_list)
validate_barcodefile(barcode_list, False, quiet)
fetch_rule(workflowdir, "align_ema.smk")
fetch_report(workflowdir, "align_stats.Rmd")
fetch_report(workflowdir, "align_bxstats.Rmd")
Expand Down
Loading
Loading