Skip to content

Commit

Permalink
Comments updated
Browse files Browse the repository at this point in the history
  • Loading branch information
Psy-Fer committed Mar 17, 2019
1 parent 94154ac commit 23a05f3
Showing 1 changed file with 27 additions and 5 deletions.
32 changes: 27 additions & 5 deletions fast5_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@
- autobuild index file - make it a sub script as well
- Consider using csv.DictReader() instead of wheel building
- options to build new index of fetched fast5s
- Add in verbosity for stderr outputs
- Add sam file compatibility for filtering
- Do extra checks for \r with mac/windows files
- Memory reduction using hashed binary indexes of larger files
- individual file flag
- Multi-fast5 support
- multiprocessing
-----------------------------------------------------------------------------
MIT License
Expand All @@ -48,7 +55,6 @@
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
MyParser
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
Expand All @@ -71,7 +77,7 @@ def error(self, message):

def main():
'''
do the thing
Main function to control each major control function.
'''
parser = MyParser(
description="fast_fetcher - extraction of specific nanopore fast5 files")
Expand Down Expand Up @@ -107,12 +113,16 @@ def main():
parser.print_help(sys.stderr)
sys.exit(1)

# TODO: hide with verbosity
print >> sys.stderr, "Starting things up!"

p_dic = {}
if args.pppp:
# TODO: hide with verbosity
print >> sys.stderr, "PPPP state! Not extracting, exporting tar commands"


# Handles the trimmming of fastq or sequencing summary files depending on input
trim_pass = False
if args.trim:
SS = False
Expand All @@ -134,14 +144,15 @@ def main():
if args.seq_sum:
SS = args.seq_sum

# final check
# final check for trimming
if FQ and SS:
trim_pass = True
print >> sys.stderr, "Trim setting detected. Writing to working direcory"
else:
print >> sys.stderr, "Unable to verify both fastq and sequencing_summary files. Please check filenames and try again. Exiting..."
sys.exit()

# Do the actual trimming depending on input type - non destructive, so can do it first
ids = []
if args.fastq:
ids = get_fq_reads(args.fastq)
Expand All @@ -162,9 +173,11 @@ def main():
filenames, ids = get_filenames(args.seq_sum, ids)

paths = get_paths(args.index, filenames)
# TODO: hide with verbosity
print >> sys.stderr, "extracting..."
# place multiprocessing pool here
# TODO: place multiprocessing pool here
for p, f in paths:
# if -z option, get file paths for command lists
if args.pppp:
if p in p_dic:
p_dic[p].append(f)
Expand All @@ -180,6 +193,7 @@ def main():
# For each .tar file, write a file with the tarball name as filename.tar.txt
# and contains a list of files to extract - input for batch_tater.py
if args.pppp:
# TODO: check for naming and dynamically make tater_master filename
with open("tater_master.txt", 'w') as m:
for i in p_dic:
fname = "tater_" + i.split('/')[-1] + ".txt"
Expand All @@ -191,7 +205,7 @@ def main():
for j in p_dic[i]:
f.write(j)
f.write('\n')

# TODO: hide with verbosity
print >> sys.stderr, "done!"


Expand All @@ -210,6 +224,7 @@ def get_fq_reads(fastq):
'''
read fastq file and extract read ids
quick and dirty to limit library requirements - still bullet fast
reads 4 lines at a time and on 1st line, split and get id, dropping the @ symbol
'''
c = 0
read_ids = set()
Expand All @@ -234,6 +249,7 @@ def get_fq_reads(fastq):
def get_paf_reads(reads):
'''
Parse paf file to pull read ids (from minimap2 alignment)
First field is the readID
'''
read_ids = set()
if reads.endswith('.gz'):
Expand All @@ -253,6 +269,7 @@ def get_paf_reads(reads):
def get_flat_reads(filename):
'''
Parse a flat file separated by line breaks \n
ie, one readID per line
TODO: make @ symbol check once, as they should all be the same
'''
read_ids = set()
Expand Down Expand Up @@ -345,6 +362,8 @@ def trim_both(args, ids, FQ, SS):
def get_filenames(seq_sum, ids):
'''
match read ids with seq_sum to pull filenames
uses set to remove possible duplicates, as well as faster "if in" checks
sets are hashed, lists are not, so O(N) vs O(1) complexity
'''
# for when using seq_sum for filtering, and not fq,paf,flat
ss_only = False
Expand Down Expand Up @@ -378,6 +397,7 @@ def get_filenames(seq_sum, ids):
def get_paths(index_file, filenames, f5=None):
'''
Read index and extract full paths for file extraction
This could be done better with byte indexing
'''
tar = False
paths = []
Expand Down Expand Up @@ -435,6 +455,8 @@ def extract_file(args, path, filename):
--transform not working on MacOS. Need to use gtar
Thanks to Kai Martin for picking that one up!
TODO: move OS detection to top, and use to help compatibility for edge cases.
works pretty well on all systems so far, but all about dat support!
'''
OSystem = ""
OSystem = args.OSystem
Expand Down

0 comments on commit 23a05f3

Please sign in to comment.