-
Notifications
You must be signed in to change notification settings - Fork 10
/
process_sra_directory.py
executable file
·53 lines (43 loc) · 1.52 KB
/
process_sra_directory.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
"""
This file performs multiple runs of the following:
1. Convert reads from SRA to FASTQ
2. Align reads with HISAT2, in single- or paired-end as appropriate
3. Map reads to genes
4. Convert counts to RPKM
5. Save RPKM and summary data
"""
from argparse import ArgumentParser
from pathlib import Path
import pandas as pd
from alignment import process_sra_file
from utils import add_common_command_line_arguments
SRA_PATTERN = '*.sra'
if __name__ == '__main__':
p = ArgumentParser()
p.add_argument(
'sra_directory',
type=Path,
help='Directory containing SRA files',
)
add_common_command_line_arguments(p)
args = p.parse_args()
all_rpkm = []
all_alignment_metadata = []
for sra_file in args.sra_directory.glob(SRA_PATTERN):
rpkm, alignment_metadata = process_sra_file(
sra_path=sra_file,
subprocesses=args.subprocesses,
hisat2_options=args.hisat2_options,
reference_path=args.reference_path
)
all_rpkm.append(rpkm)
all_alignment_metadata.append(alignment_metadata)
rpkm = pd.DataFrame(all_rpkm)
alignment_metadata = pd.DataFrame(all_alignment_metadata)
if args.output_file is None:
args.output_file = args.sra_directory / 'expr.hdf5'
print('Saving expression and alignment metadata to', args.output_file)
with pd.HDFStore(args.output_file) as store:
store['rpkm'] = pd.DataFrame(rpkm)
store['alignment_metadata'] = pd.DataFrame(alignment_metadata)