Skip to content

Commit

Permalink
patch to fix issue with uppercase gene names; also found a bug that led to demultiplexed reads from the latest versions of guppy not being recognised.
Browse files Browse the repository at this point in the history
  • Loading branch information
aineniamh committed Feb 15, 2022
1 parent d9a36db commit d789350
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 15 deletions.
2 changes: 1 addition & 1 deletion apollo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
_program = "apollo"
__version__ = "0.2"
__version__ = "0.2.1"
6 changes: 4 additions & 2 deletions apollo/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,16 @@ def main(sysargs = sys.argv[1:]):
"""
Get outdir, tempdir and the data
"""
# get data for a particular species, and get species
qcfunk.get_package_data(thisdir, args.species, config)

# default output dir
qcfunk.get_outdir(args.outdir,args.output_prefix,cwd,config)

# specifying temp directory, outdir if no_temp (tempdir becomes working dir)
tempdir = qcfunk.get_temp_dir(args.tempdir, args.no_temp,cwd,config)

# get data for a particular species, and get species
qcfunk.get_package_data(thisdir, args.species, config)


config["cpg_header"] = qcfunk.make_cpg_header(config["cpg_sites"])

Expand Down
8 changes: 5 additions & 3 deletions apollo/scripts/apollofunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def make_timestamped_outdir(cwd,outdir,config):
output_prefix = '_'.join(split_prefix[:-1])
config["output_prefix"] = output_prefix
species = config["species"]
timestamp = str(datetime.now().isoformat(timespec='milliseconds')).replace(":","").replace(".","").replace("T","-")
timestamp = str(datetime.now().isoformat(timespec='seconds')).replace(":","").replace(".","").replace("T","-")
outdir = os.path.join(cwd, f"{output_prefix}_{species}_{timestamp}")
rel_outdir = os.path.join(".",timestamp)

Expand Down Expand Up @@ -198,10 +198,12 @@ def get_package_data(thisdir,species_arg, config):


def look_for_guppy_barcoder(demultiplex_arg,path_to_guppy_arg,cwd,config):

add_arg_to_config("demultiplex", demultiplex_arg, config)
add_arg_to_config("path_to_guppy", path_to_guppy_arg, config)

if config["demultiplex"]:

if config["path_to_guppy"]:
expanded_path = os.path.expanduser(config["path_to_guppy"])
if config["path_to_guppy"].endswith("guppy_barcoder"):
Expand Down Expand Up @@ -279,10 +281,10 @@ def look_for_barcodes_csv(barcodes_csv_arg,cwd,config):
sys.stderr.write(f"Error: Barcode file missing header field `barcode`\n")
sys.exit(-1)
for row in reader:
if row["barcode"].startswith("NB") or row["barcode"].startswith("BC"):
if row["barcode"].startswith("NB") or row["barcode"].startswith("BC") or row["barcode"].startswith("barcode"):
barcodes.append(row["barcode"])
else:
sys.stderr.write(f"Error: Please provide barcodes in the format `NB01` or `BC01`\n")
sys.stderr.write(f"Error: Please provide barcodes in the format `NB01`, `BC01` or `barcode01`\n")
sys.exit(-1)

print(f"{len(barcodes)} barcodes read in from file")
Expand Down
17 changes: 8 additions & 9 deletions apollo/scripts/paramether.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def get_best_reference(query, ref_dict, matrix):

new_reference_alignment = align_read(query, ref, ref_dict[ref], matrix)
if new_reference_alignment["coverage"] > 0.7:

if best_reference_alignment["identity"] < new_reference_alignment["identity"]:
best_reference_alignment = new_reference_alignment
else:
Expand Down Expand Up @@ -74,7 +75,7 @@ def process_file(reads,references,cpg_dict,sample,cpg_counter,nuc_matrix):

stats = get_best_reference(str(record.seq), references, nuc_matrix)
best_ref,direction = stats["reference"].split("_")

background_error_rate = get_background_error_rate(stats)

alignment_covers = int(stats["aln_len"]) / int(stats["len"]) # doesn't account for gaps so can be > 1
Expand All @@ -87,21 +88,18 @@ def process_file(reads,references,cpg_dict,sample,cpg_counter,nuc_matrix):
else:
read_seq = str(record.seq.reverse_complement())
ref_seq = references[best_ref + "_forward"]

alignment = align_read(read_seq, best_ref, ref_seq, nuc_matrix)

for site in cpg_dict[best_ref]:

read_variant = get_site(site[1],alignment)

cpg_counter[site[0]][read_variant]+=1


counts[best_ref]+=1

else:
counts["None"]+=1


return counts, cpg_counter


Expand Down Expand Up @@ -133,7 +131,7 @@ def get_site(cpg_index, stats):

def load_cpg_dict(cpg_csv):
cpg_dict= collections.defaultdict(list)
with open(str(cpg_csv),"r") as f:
with open(cpg_csv,"r") as f:
cpg_file = csv.DictReader(f)
for row in cpg_file:
position = int(row["position"]) - 1
Expand All @@ -152,8 +150,9 @@ def load_reference_dict(ref_file):

references = {}
for record in SeqIO.parse(ref_file, "fasta"):
references[record.id + "_forward"] = str(record.seq)
references[record.id + "_reverse"] = str(record.seq.reverse_complement())
lower_id = str(record.id).lower()
references[lower_id+ "_forward"] = str(record.seq)
references[lower_id + "_reverse"] = str(record.seq.reverse_complement())
return references

if __name__ == '__main__':
Expand Down

0 comments on commit d789350

Please sign in to comment.