Comments update and Spull meth/pA flags added
Psy-Fer committed Mar 20, 2019
1 parent 23a05f3 commit 2a87258
Showing 4 changed files with 162 additions and 82 deletions.
57 changes: 24 additions & 33 deletions MotifSeq.py
@@ -21,7 +21,7 @@
Garvan Institute
Copyright 2018
SigTools - a nanopore raw signal toolkit
MotifSeq - finding signal motifs in raw nanopore signal
--------------------------------------------------------------------------------------
version 0.0 - initial
@@ -32,6 +32,13 @@
- Move methods of data processing into yield based functions
- ensure all args removed from functions
- make callable from other scripts
- Extra plotting options - comparing signals
- E value for matches
- Able to take different models
- add in scrappie API for model building
- adapter search values as an argument
- integration with segmenter
- Take any signal format based on headers
-----------------------------------------------------------------------------
MIT License
@@ -67,7 +74,7 @@ def error(self, message):

def main():
'''
do the thing
Main function for executing logic based on file input types
'''
parser = MyParser(
description="MotifSeq - the Ctrl+f for signal. Signal-level local alignment of sequence motifs.")
@@ -142,7 +149,7 @@ def main():
print >> sys.stderr, "Failed to extract signal", path, fast5
continue
sig = np.array(sig, dtype=int)
sig = scale_outliers(sig, args.scale_hi, args.scale_low)
sig = scale_outliers(sig, args)
sig = sklearn.preprocessing.scale(sig,
axis=0,
with_mean=True,
@@ -173,7 +180,7 @@ def main():
print >> sys.stderr, "main():data not extracted. Moving to next file", fast5_file
continue
sig = np.array(sig, dtype=int)
sig = scale_outliers(sig, args.scale_hi, args.scale_low)
sig = scale_outliers(sig, args)
sig = sklearn.preprocessing.scale(sig,
axis=0,
with_mean=True,
@@ -212,7 +219,7 @@ def main():
if not sig.any():
print >> sys.stderr, "nope 1"
continue
sig = scale_outliers(sig, args.scale_hi, args.scale_low)
sig = scale_outliers(sig, args)
sig = sklearn.preprocessing.scale(sig,
axis=0,
with_mean=True,
@@ -247,14 +254,14 @@ def dicSwitch(i):
return open_method[i]


def scale_outliers(squig, hi, low):
''' Scale outliers to within m stdevs of median '''
ret = []
for i in squig:
if i > hi or i < low:
continue
ret.append(i)
return np.array(ret, dtype=int)
def scale_outliers(squig, args):
'''
Remove outliers based on hi/low args.
I was scaling at one point, but removing tends to be less problematic
This can change the position co-ordinates a little
'''
k = (squig > args.scale_low) & (squig < args.scale_hi)
return squig[k]
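
A minimal sketch of the boolean-mask filtering the new scale_outliers uses, with made-up cutoffs standing in for args.scale_hi / args.scale_low:

import numpy as np

squig = np.array([-14, 480, 530, 1200, 610], dtype=int)
scale_low, scale_hi = 0, 1000                    # hypothetical cutoffs
keep = (squig > scale_low) & (squig < scale_hi)  # element-wise boolean mask
print(squig[keep])                               # [ 480  530  610]

Because outlier samples are removed rather than clipped, downstream position co-ordinates can shift slightly, as the docstring notes.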


def process_fast5(path):
@@ -272,7 +279,6 @@ def process_fast5(path):
return squig
# extract raw signal
try:
#b = sorted([i for i in hdf['Analyses'].keys() if i[0] == 'B'])[-1]
c = hdf['Raw/Reads'].keys()
for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
squig.append(int(col))
@@ -285,7 +291,7 @@

def read_synth_model(filename):
'''
read squiggle data ready for dtw
read squiggle data from scrappie, old and new version, ready for dtw
'''
dic = {}
with open(filename, 'r') as r:
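
The parsing body is truncated here. As a hedged sketch only: scrappie's squiggle output is a tab-separated table, and a toy reader might look like the below (the pos/base/current/sd/dwell column layout is an assumption based on common scrappie versions, which is presumably what the old/new version handling accounts for):

def read_scrappie_squiggle(filename):
    '''toy parser for scrappie squiggle TSV output (assumed layout)'''
    model = []
    with open(filename, 'r') as fh:
        for line in fh:
            if line.startswith('#'):        # read-name comment lines
                continue
            fields = line.strip().split('\t')
            if fields[0] == 'pos':          # column-header line
                continue
            model.append(float(fields[2]))  # per-base mean current
    return model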
@@ -305,7 +311,7 @@

def read_bait_model(filename):
'''
read baited signal file
read baited signal file - Not currently in use
'''
dic = {}
if filename.endswith('.gz'):
@@ -329,6 +335,7 @@ def get_baits(args, sig, start, end, adapter_pos):
"""
logic to get baits from candidate read by adjusting boundaries, and locking
in a section to be pushed to a file
Not currently in use
"""
# loop on key press to modify boundaries and Visualise

@@ -341,7 +348,7 @@

def get_segs(segfile):
'''
create segment dic
create segment dic from segmenter output
'''
dic = {}
with open(segfile, 'r') as f2:
@@ -374,13 +381,6 @@ def get_adapter(args, sig, adapter, fast5):
if args.view:
view_adapter(sig, start, end)

# if test_adapter(start):
# return True
# else:
# return False

# pos = [dist, start, end]
# return pos


def get_adapter_2(args, sig, adapter, segs, fast5):
Expand All @@ -404,14 +404,6 @@ def get_adapter_2(args, sig, adapter, segs, fast5):
if args.view:
view_adapter(sig, start, end, s=segs)

# if test_adapter(start - segs[1]):
# return True
# else:
# return False
#
# pos = [dist, start, end]
# return pos


def view_adapter(sig, start, end, s=False):
'''
@@ -468,7 +460,6 @@ def get_region(args, sig, model, fast5):
view_region(sig, start, end, cost, path, model)
return

# def view_region(sig, start, end, s=False):


def view_region(sig, start, end, cost, path, model, s=False):
6 changes: 4 additions & 2 deletions SquigglePlot.py
@@ -34,6 +34,8 @@
TODO:
- change size of saved figure
- yaml/config file for multiple plots
- Default plot settings rolled out toolkit-wide
- light smoothing
-----------------------------------------------------------------------------
MIT License
@@ -69,10 +71,10 @@ def error(self, message):

def main():
'''
do the thing
Main function for executing logic based on file input types
'''
parser = MyParser(
description="script name - script description")
description="SquigglePlot - plotting the raw signal data")
group = parser.add_mutually_exclusive_group()
group.add_argument("-f", "--f5f",
help="File list of fast5 paths")
116 changes: 87 additions & 29 deletions SquigglePull.py
@@ -14,7 +14,7 @@
Garvan Institute
Copyright 2017
Pull squiggle data from fast5 files given some targeting method
Pull squiggle data from fast5 files
input:
- path to fast5 files
@@ -23,17 +23,13 @@
- tsv signal file
TODO:
- get meth parameters, for Hasindu
- Dynamic columns and data types
- Multi-fast5 file support
- paf, sam, fastq, or flat file support for filtering
- multiprocessing
- use # notation at start of file for static values, size reduction,
including things like kits, flowcells, versions, etc, for comparisons.
// get channel parameters
const char* scaling_path = "/UniqueGlobalKey/channel_id";
hid_t scaling_group = H5Gopen(hdf5_file, scaling_path, H5P_DEFAULT);
digitisation = fast5_read_float_attribute(scaling_group, "digitisation");
offset = fast5_read_float_attribute(scaling_group, "offset");
range = fast5_read_float_attribute(scaling_group, "range");
sample_rate = fast5_read_float_attribute(scaling_group, "sampling_rate");
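
For reference, a minimal h5py equivalent of that C snippet, mirroring what extract_f5() below does on the --meth path (the filename is a placeholder):

import h5py

with h5py.File('read.fast5', 'r') as hdf:
    ch = hdf['UniqueGlobalKey/channel_id']
    digitisation = ch.attrs['digitisation']
    offset = ch.attrs['offset']
    sig_range = ch.attrs['range']      # renamed to avoid shadowing the builtin
    sampling_rate = ch.attrs['sampling_rate']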
Testing:
@@ -100,6 +96,10 @@ def main():
help="Engage higher output verbosity")
parser.add_argument("-s", "--scale", action="store_true",
help="Scale signal output for comparison")
parser.add_argument("-c", "--pA_convert", action="store_true",
help="Convert raw signal to pA, for comparisons")
parser.add_argument("-m", "--meth", action="store_true",
help="Print extra information used in methylation calling - nanopolish/f5c")
# parser.add_argument("-a", "--paf",
# help="paf alignment file for nt approach - Benchmarking")
args = parser.parse_args()
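
With the new flags, a hypothetical invocation (assuming SquigglePull's existing raw and path options) would be: python SquigglePull.py --raw --meth --path reads/ > meth_signal.tsv, giving one row per read with the channel scaling fields included.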
@@ -138,11 +138,27 @@ def main():
print '{}\t{}\t{}\t{}\t{}'.format(
fast5, region[0], region[1], region[2], '\t'.join(ar))
elif args.raw:
ar = []
for i in region[2]:
ar.append(str(i))
print '{}\t{}\t{}\t{}'.format(
fast5, region[0], region[1], '\t'.join(ar))
if args.pA_convert:
# convert signal to pA
pA_sig = convert_to_pA(data)
ar = []
for i in pA_sig:
ar.append(str(i))
print '{}\t{}\t{}\t{}\t{}'.format(
fast5, data['readID'], region[0], region[1], '\t'.join(ar))
elif args.meth:
ar = []
for i in region[2]:
ar.append(str(i))
print '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(fast5, data['readID'],
data['digitisation'], data['offset'], data['range'],
data['sampling_rate'], '\t'.join(ar))
else:
ar = []
for i in region[2]:
ar.append(str(i))
print '{}\t{}\t{}\t{}\t{}'.format(
fast5, data['readID'], region[0], region[1], '\t'.join(ar))
if args.verbose:
end_time = time.time() - start_time
print >> sys.stderr, "Time taken:", end_time
@@ -168,7 +184,9 @@ def extract_f5(filename, args, batch=False):
Takes the latest basecalled events table.
'''

f5_dic = {'raw': [], 'events': [], 'seq': '', 'seq_id': ''}
f5_dic = {'raw': [], 'events': [], 'seq': '', 'readID': '',
'digitisation': 0.0, 'offset': 0.0, 'range': 0.0,
'sampling_rate': 0.0}

# open fast5 file
try:
@@ -189,7 +207,7 @@
fq = hdf['Analyses'][b]['BaseCalled_template']['Fastq'][()].split('\n')
f5_dic['seq'] = fq
f5_dic['seq_id'] = fq[0].split(' ')[0].split('_')[0][1:]
f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id']

except:
traceback.print_exc()
@@ -198,15 +216,36 @@

# extract raw signal
elif args.raw:
try:
c = hdf['Raw/Reads'].keys()
for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
f5_dic['raw'].append(int(col))
if args.pA_convert or args.meth:
try:
c = hdf['Raw/Reads'].keys()
for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
f5_dic['raw'].append(int(col))

except:
traceback.print_exc()
print >> sys.stderr, 'extract_fast5():failed to extract events or fastq from', filename
f5_dic = {}

f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id']
f5_dic['digitisation'] = hdf['UniqueGlobalKey/channel_id'].attrs['digitisation']
f5_dic['offset'] = hdf['UniqueGlobalKey/channel_id'].attrs['offset']
f5_dic['range'] = float("{0:.2f}".format(hdf['UniqueGlobalKey/channel_id'].attrs['range']))
f5_dic['sampling_rate'] = hdf['UniqueGlobalKey/channel_id'].attrs['sampling_rate']

except:
traceback.print_exc()
print >> sys.stderr, 'extract_fast5():failed to extract events or fastq from', filename
f5_dic = {}

else:
try:
c = hdf['Raw/Reads'].keys()
for col in hdf['Raw/Reads/'][c[0]]['Signal'][()]:
f5_dic['raw'].append(int(col))

f5_dic['readID'] = hdf['Raw/Reads/'][c[0]].attrs['read_id']

except:
traceback.print_exc()
print >> sys.stderr, 'extract_fast5():failed to extract events or fastq from', filename
f5_dic = {}

# signal flag not set
else:
Expand All @@ -230,7 +269,7 @@ def pull_target(data, args, min_length=50, paf=None):
Returns:
- Regions of interest labelled by read_id/filename
dicf5_dic = {'events': [], 'moves': [], 'seq': '', 'seq_id': ''}
dicf5_dic = {'events': [], 'moves': [], 'seq': '', 'readID': ''}
'''
default = []
region = []
Expand All @@ -248,7 +287,7 @@ def pull_target(data, args, min_length=50, paf=None):
if args.scale:
signal = scale_data(signal)

region.append(data['seq_id'])
region.append(data['readID'])
region.append(target)
region.append(target_type)
region.append(signal)
Expand All @@ -262,7 +301,7 @@ def pull_target(data, args, min_length=50, paf=None):
signal = scale_data(signal)
target = str(len(signal))

#region.append(data['seq_id'])
#region.append(data['readID'])
region.append(target)
region.append(target_type)
region.append(signal)
@@ -295,5 +334,24 @@ def scale_data(data):
return scaled_data
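
scale_data's body is truncated above. As a hedged guess at its shape only (not necessarily the author's implementation), a z-score standardisation matching the sklearn calls in MotifSeq.py's main() would be:

import sklearn.preprocessing

def scale_data_sketch(sig):
    '''zero-mean, unit-variance scaling (assumed behaviour)'''
    return sklearn.preprocessing.scale(sig, axis=0, with_mean=True, with_std=True)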


def convert_to_pA(d):
'''
convert raw signal data to pA using digitisation, offset, and range
float raw_unit = range / digitisation;
for (int32_t j = 0; j < nsample; j++) {
rawptr[j] = (rawptr[j] + offset) * raw_unit;
}
'''
digitisation = d['digitisation']
range = d['range']
offset = d['offset']
raw_unit = range / digitisation
new_raw = []
for i in d['raw']:
j = (i + offset) * raw_unit
new_raw.append("{0:.2f}".format(round(j,2)))
return new_raw
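
A worked example with made-up but plausible channel values: raw_unit = 1455.59 / 8192.0 ≈ 0.17768, so a raw sample of 590 converts as (590 + 6) * 0.17768 ≈ 105.90 pA.

example = {'digitisation': 8192.0, 'range': 1455.59, 'offset': 6.0,
           'raw': [590, 610]}
print(convert_to_pA(example))   # ['105.90', '109.45']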


if __name__ == '__main__':
main()
