From d789350bae13170f617b8b393361114676d613b3 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Tue, 15 Feb 2022 11:27:02 +0000 Subject: [PATCH] patch to fix issue with uppercase gene names, also found bug that led to demultiplexed reads with latest versions of guppy not being recognised. --- apollo/__init__.py | 2 +- apollo/command.py | 6 ++++-- apollo/scripts/apollofunks.py | 8 +++++--- apollo/scripts/paramether.py | 17 ++++++++--------- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/apollo/__init__.py b/apollo/__init__.py index 2f8befa..11a3a75 100644 --- a/apollo/__init__.py +++ b/apollo/__init__.py @@ -1,2 +1,2 @@ _program = "apollo" -__version__ = "0.2" +__version__ = "0.2.1" diff --git a/apollo/command.py b/apollo/command.py index 3514b4d..5f87b96 100644 --- a/apollo/command.py +++ b/apollo/command.py @@ -83,14 +83,16 @@ def main(sysargs = sys.argv[1:]): """ Get outdir, tempdir and the data """ + # get data for a particular species, and get species + qcfunk.get_package_data(thisdir, args.species, config) + # default output dir qcfunk.get_outdir(args.outdir,args.output_prefix,cwd,config) # specifying temp directory, outdir if no_temp (tempdir becomes working dir) tempdir = qcfunk.get_temp_dir(args.tempdir, args.no_temp,cwd,config) - # get data for a particular species, and get species - qcfunk.get_package_data(thisdir, args.species, config) + config["cpg_header"] = qcfunk.make_cpg_header(config["cpg_sites"]) diff --git a/apollo/scripts/apollofunks.py b/apollo/scripts/apollofunks.py index ae0d509..c291edb 100644 --- a/apollo/scripts/apollofunks.py +++ b/apollo/scripts/apollofunks.py @@ -98,7 +98,7 @@ def make_timestamped_outdir(cwd,outdir,config): output_prefix = '_'.join(split_prefix[:-1]) config["output_prefix"] = output_prefix species = config["species"] - timestamp = str(datetime.now().isoformat(timespec='milliseconds')).replace(":","").replace(".","").replace("T","-") + timestamp = str(datetime.now().isoformat(timespec='seconds')).replace(":","").replace(".","").replace("T","-") outdir = os.path.join(cwd, f"{output_prefix}_{species}_{timestamp}") rel_outdir = os.path.join(".",timestamp) @@ -198,10 +198,12 @@ def get_package_data(thisdir,species_arg, config): def look_for_guppy_barcoder(demultiplex_arg,path_to_guppy_arg,cwd,config): + add_arg_to_config("demultiplex", demultiplex_arg, config) add_arg_to_config("path_to_guppy", path_to_guppy_arg, config) if config["demultiplex"]: + if config["path_to_guppy"]: expanded_path = os.path.expanduser(config["path_to_guppy"]) if config["path_to_guppy"].endswith("guppy_barcoder"): @@ -279,10 +281,10 @@ def look_for_barcodes_csv(barcodes_csv_arg,cwd,config): sys.stderr.write(f"Error: Barcode file missing header field `barcode`\n") sys.exit(-1) for row in reader: - if row["barcode"].startswith("NB") or row["barcode"].startswith("BC"): + if row["barcode"].startswith("NB") or row["barcode"].startswith("BC") or row["barcode"].startswith("barcode"): barcodes.append(row["barcode"]) else: - sys.stderr.write(f"Error: Please provide barcodes in the format `NB01` or `BC01`\n") + sys.stderr.write(f"Error: Please provide barcodes in the format `NB01`, `BC01` or `barcode01`\n") sys.exit(-1) print(f"{len(barcodes)} barcodes read in from file") diff --git a/apollo/scripts/paramether.py b/apollo/scripts/paramether.py index 79bc4c2..4e47091 100644 --- a/apollo/scripts/paramether.py +++ b/apollo/scripts/paramether.py @@ -34,6 +34,7 @@ def get_best_reference(query, ref_dict, matrix): new_reference_alignment = align_read(query, ref, ref_dict[ref], matrix) if new_reference_alignment["coverage"] > 0.7: + if best_reference_alignment["identity"] < new_reference_alignment["identity"]: best_reference_alignment = new_reference_alignment else: @@ -74,7 +75,7 @@ def process_file(reads,references,cpg_dict,sample,cpg_counter,nuc_matrix): stats = get_best_reference(str(record.seq), references, nuc_matrix) best_ref,direction = stats["reference"].split("_") - + background_error_rate = get_background_error_rate(stats) alignment_covers = int(stats["aln_len"]) / int(stats["len"]) # doesn't account for gaps so can be > 1 @@ -87,21 +88,18 @@ def process_file(reads,references,cpg_dict,sample,cpg_counter,nuc_matrix): else: read_seq = str(record.seq.reverse_complement()) ref_seq = references[best_ref + "_forward"] - + alignment = align_read(read_seq, best_ref, ref_seq, nuc_matrix) - for site in cpg_dict[best_ref]: - read_variant = get_site(site[1],alignment) + cpg_counter[site[0]][read_variant]+=1 - counts[best_ref]+=1 else: counts["None"]+=1 - return counts, cpg_counter @@ -133,7 +131,7 @@ def get_site(cpg_index, stats): def load_cpg_dict(cpg_csv): cpg_dict= collections.defaultdict(list) - with open(str(cpg_csv),"r") as f: + with open(cpg_csv,"r") as f: cpg_file = csv.DictReader(f) for row in cpg_file: position = int(row["position"]) - 1 @@ -152,8 +150,9 @@ def load_reference_dict(ref_file): references = {} for record in SeqIO.parse(ref_file, "fasta"): - references[record.id + "_forward"] = str(record.seq) - references[record.id + "_reverse"] = str(record.seq.reverse_complement()) + lower_id = str(record.id).lower() + references[lower_id+ "_forward"] = str(record.seq) + references[lower_id + "_reverse"] = str(record.seq.reverse_complement()) return references if __name__ == '__main__':