Skip to content

Commit

Permalink
patch to fix issue with uppercase gene names; also found a bug that led to demultiplexed reads from the latest versions of guppy not being recognised.
Browse files Browse the repository at this point in the history
  • Loading branch information
aineniamh committed Feb 15, 2022
1 parent d9a36db commit d789350
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 15 deletions.
2 changes: 1 addition & 1 deletion apollo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
_program = "apollo"
__version__ = "0.2"
__version__ = "0.2.1"
6 changes: 4 additions & 2 deletions apollo/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,16 @@ def main(sysargs = sys.argv[1:]):
"""
Get outdir, tempdir and the data
"""
# get data for a particular species, and get species
qcfunk.get_package_data(thisdir, args.species, config)

# default output dir
qcfunk.get_outdir(args.outdir,args.output_prefix,cwd,config)

# specifying temp directory, outdir if no_temp (tempdir becomes working dir)
tempdir = qcfunk.get_temp_dir(args.tempdir, args.no_temp,cwd,config)

# get data for a particular species, and get species
qcfunk.get_package_data(thisdir, args.species, config)


config["cpg_header"] = qcfunk.make_cpg_header(config["cpg_sites"])

Expand Down
8 changes: 5 additions & 3 deletions apollo/scripts/apollofunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def make_timestamped_outdir(cwd,outdir,config):
output_prefix = '_'.join(split_prefix[:-1])
config["output_prefix"] = output_prefix
species = config["species"]
timestamp = str(datetime.now().isoformat(timespec='milliseconds')).replace(":","").replace(".","").replace("T","-")
timestamp = str(datetime.now().isoformat(timespec='seconds')).replace(":","").replace(".","").replace("T","-")
outdir = os.path.join(cwd, f"{output_prefix}_{species}_{timestamp}")
rel_outdir = os.path.join(".",timestamp)

Expand Down Expand Up @@ -198,10 +198,12 @@ def get_package_data(thisdir,species_arg, config):


def look_for_guppy_barcoder(demultiplex_arg,path_to_guppy_arg,cwd,config):

add_arg_to_config("demultiplex", demultiplex_arg, config)
add_arg_to_config("path_to_guppy", path_to_guppy_arg, config)

if config["demultiplex"]:

if config["path_to_guppy"]:
expanded_path = os.path.expanduser(config["path_to_guppy"])
if config["path_to_guppy"].endswith("guppy_barcoder"):
Expand Down Expand Up @@ -279,10 +281,10 @@ def look_for_barcodes_csv(barcodes_csv_arg,cwd,config):
sys.stderr.write(f"Error: Barcode file missing header field `barcode`\n")
sys.exit(-1)
for row in reader:
if row["barcode"].startswith("NB") or row["barcode"].startswith("BC"):
if row["barcode"].startswith("NB") or row["barcode"].startswith("BC") or row["barcode"].startswith("barcode"):
barcodes.append(row["barcode"])
else:
sys.stderr.write(f"Error: Please provide barcodes in the format `NB01` or `BC01`\n")
sys.stderr.write(f"Error: Please provide barcodes in the format `NB01`, `BC01` or `barcode01`\n")
sys.exit(-1)

print(f"{len(barcodes)} barcodes read in from file")
Expand Down
17 changes: 8 additions & 9 deletions apollo/scripts/paramether.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def get_best_reference(query, ref_dict, matrix):

new_reference_alignment = align_read(query, ref, ref_dict[ref], matrix)
if new_reference_alignment["coverage"] > 0.7:

if best_reference_alignment["identity"] < new_reference_alignment["identity"]:
best_reference_alignment = new_reference_alignment
else:
Expand Down Expand Up @@ -74,7 +75,7 @@ def process_file(reads,references,cpg_dict,sample,cpg_counter,nuc_matrix):

stats = get_best_reference(str(record.seq), references, nuc_matrix)
best_ref,direction = stats["reference"].split("_")

background_error_rate = get_background_error_rate(stats)

alignment_covers = int(stats["aln_len"]) / int(stats["len"]) # doesn't account for gaps so can be > 1
Expand All @@ -87,21 +88,18 @@ def process_file(reads,references,cpg_dict,sample,cpg_counter,nuc_matrix):
else:
read_seq = str(record.seq.reverse_complement())
ref_seq = references[best_ref + "_forward"]

alignment = align_read(read_seq, best_ref, ref_seq, nuc_matrix)

for site in cpg_dict[best_ref]:

read_variant = get_site(site[1],alignment)

cpg_counter[site[0]][read_variant]+=1


counts[best_ref]+=1

else:
counts["None"]+=1


return counts, cpg_counter


Expand Down Expand Up @@ -133,7 +131,7 @@ def get_site(cpg_index, stats):

def load_cpg_dict(cpg_csv):
cpg_dict= collections.defaultdict(list)
with open(str(cpg_csv),"r") as f:
with open(cpg_csv,"r") as f:
cpg_file = csv.DictReader(f)
for row in cpg_file:
position = int(row["position"]) - 1
Expand All @@ -152,8 +150,9 @@ def load_reference_dict(ref_file):

references = {}
for record in SeqIO.parse(ref_file, "fasta"):
references[record.id + "_forward"] = str(record.seq)
references[record.id + "_reverse"] = str(record.seq.reverse_complement())
lower_id = str(record.id).lower()
references[lower_id+ "_forward"] = str(record.seq)
references[lower_id + "_reverse"] = str(record.seq.reverse_complement())
return references

if __name__ == '__main__':
Expand Down

0 comments on commit d789350

Please sign in to comment.