Skip to content

Commit

Permalink
added sam file reading option for binary flag 256
Browse files Browse the repository at this point in the history
  • Loading branch information
sodiumnitrate committed Dec 9, 2024
1 parent c68f9fd commit 8e6b449
Show file tree
Hide file tree
Showing 14 changed files with 72 additions and 16 deletions.
6 changes: 4 additions & 2 deletions src/include/sam_entry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ class SamEntry{
int end_pos;

float alignment_score;
int bin_flag;
public:
SamEntry(std::string& rn, std::string& s, std::string& m, int st, int ed, float as);
SamEntry(std::string& rn, std::string& s, std::string& m, int st, int ed, float as, int bin_flag);
std::string get_read_name();
std::string get_seq_str();
std::string get_mapped_onto();
int get_start_pos();
int get_end_pos();
int get_length();
int get_binary_flag();
Sequence to_sequence();
};
};
3 changes: 3 additions & 0 deletions src/include/sam_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class SamFile{
std::vector<int> end_indices = {-1};
std::vector<std::string> mapped_onto = {""};
float min_score = 0;
bool primary_map = false;

// if AS is not normalized, we need to use sequence lengths
bool normalized_score = true;
Expand Down Expand Up @@ -42,6 +43,8 @@ class SamFile{
int size();
void set_normalized_true();
void set_normalized_false();
void set_primary_map_true();
void set_primary_map_false();
bool get_normalized();
int get_seq_start();
int get_seq_end();
Expand Down
7 changes: 4 additions & 3 deletions src/include/sam_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ class SamFilter{
std::unordered_set<std::string> nameset;
std::unordered_map<std::string, std::vector<std::tuple<int, int> > > nuc_ranges;
bool contains_empty = false;
bool primary_map = false;
public:
SamFilter(std::vector<std::string>& names, std::vector<int>& starts, std::vector<int>& ends);
bool query(std::string& name, int start, int end);
SamFilter(std::vector<std::string>& names, std::vector<int>& starts, std::vector<int>& ends, bool pm);
bool query(std::string& name, int start, int end, int bin_flag);
bool check_name(std::string& name);
};
};
7 changes: 5 additions & 2 deletions src/sam_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
namespace py = pybind11;

// constructor
SamEntry::SamEntry(std::string& rn, std::string& s, std::string& m, int st, int ed, float as)
SamEntry::SamEntry(std::string& rn, std::string& s, std::string& m, int st, int ed, float as, int bf)
{
read_name = rn;
sequence = s;
mapped_onto = m;
start_pos = st;
end_pos = ed;
alignment_score = as;
bin_flag = bf;
}

std::string SamEntry::get_read_name(){ return read_name; }
Expand All @@ -21,6 +22,7 @@ std::string SamEntry::get_mapped_onto(){ return mapped_onto; }
int SamEntry::get_start_pos(){ return start_pos; }
int SamEntry::get_end_pos(){ return end_pos; }
int SamEntry::get_length(){ return end_pos - start_pos + 1; }
int SamEntry::get_binary_flag(){ return bin_flag; }

Sequence SamEntry::to_sequence() {
Sequence result(sequence);
Expand All @@ -30,13 +32,14 @@ Sequence SamEntry::to_sequence() {

void init_sam_entry(py::module_ &m){
py::class_<SamEntry>(m, "SamEntry", py::dynamic_attr())
.def(py::init<std::string&, std::string&, std::string&, int, int, float >())
.def(py::init<std::string&, std::string&, std::string&, int, int, float, int >())
.def("get_length", &SamEntry::get_length)
.def("get_read_name", &SamEntry::get_read_name)
.def("get_seq_str", &SamEntry::get_seq_str)
.def("get_mapped_onto", &SamEntry::get_mapped_onto)
.def("get_start_pos", &SamEntry::get_start_pos)
.def("get_end_pos", &SamEntry::get_end_pos)
.def("get_binary_flag", &SamEntry::get_binary_flag)
.def("__len__", &SamEntry::get_length)
;
}
17 changes: 12 additions & 5 deletions src/sam_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ int SamFile::get_seq_start() { return seq_start;}

int SamFile::get_seq_end() { return seq_end;}

void SamFile::set_primary_map_true(){primary_map = true;}

void SamFile::set_primary_map_false(){primary_map = false;}

// read in sequence lengths from a fasta file (for when normalization of alignment scores is needed)
void SamFile::get_lengths_from_fasta(std::string fasta_file_name){
SeqSet sset;
Expand All @@ -71,12 +75,12 @@ void SamFile::read(std::string file_name){
return;
}
std::string dummy, ref_name, seq, flag, seq_name;
int pos, length, bin_flag;
int pos, length, bin_flag, og_bin_flag;
float score;
std::string as_flag = "AS:i:";

// some preprocessing to facilitate filtering
SamFilter filter_obj(mapped_onto, start_indices, end_indices);
SamFilter filter_obj(mapped_onto, start_indices, end_indices, primary_map);

int idx = 0;

Expand All @@ -99,9 +103,10 @@ void SamFile::read(std::string file_name){
ss >> seq_name >> bin_flag >> ref_name >> pos >> dummy >> dummy >> dummy >> dummy >> dummy >> seq >> dummy;

auto length = seq.length();
og_bin_flag = bin_flag;

// apply filters----------------
if (!(filter_obj.query(ref_name, pos, pos + length - 1))) continue;
if (!(filter_obj.query(ref_name, pos, pos + length - 1, bin_flag))) continue;
// -----------------------------

// check alignment score
Expand Down Expand Up @@ -139,14 +144,14 @@ void SamFile::read(std::string file_name){
seq = seq_r.get_seq();
}

SamEntry entry(seq_name, seq, ref_name, pos, pos + length - 1, score);
SamEntry entry(seq_name, seq, ref_name, pos, pos + length - 1, score, og_bin_flag);
entries.push_back(entry);
seq_start = fmin(seq_start, pos);
seq_end = fmax(seq_end, pos + length);
}

std::cout << "Done reading. The final range is: (" << seq_start << ", " << seq_end << ")." << std::endl;

// re-acquire GIL
py::gil_scoped_acquire acquire;
}
Expand Down Expand Up @@ -365,5 +370,7 @@ void init_sam_file(py::module_ &m){
.def("get_multiplicity", &SamFile::get_multiplicity)
.def("to_seq_set", &SamFile::to_seq_set)
.def("get_seq_names", &SamFile::get_seq_names)
.def("set_primary_map_true", &SamFile::set_primary_map_true)
.def("set_primary_map_false", &SamFile::set_primary_map_false)
;
}
10 changes: 8 additions & 2 deletions src/sam_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
#include <iostream>
#include "include/sam_filter.hpp"

SamFilter::SamFilter(std::vector<std::string>& names, std::vector<int>& starts, std::vector<int>& ends){
SamFilter::SamFilter(std::vector<std::string>& names, std::vector<int>& starts, std::vector<int>& ends, bool pm){
// check input sizes
if (!(names.size() == starts.size() && starts.size() == ends.size())) throw "size mismatch in input";

unsigned int len = names.size();
primary_map = pm;

// process names
for (const auto& name : names){
Expand Down Expand Up @@ -94,12 +95,17 @@ SamFilter::SamFilter(std::vector<std::string>& names, std::vector<int>& starts,
}
}

bool SamFilter::query(std::string& name, int start, int end){
bool SamFilter::query(std::string& name, int start, int end, int bin_flag){
std::string curr_name = name;
if(nameset.find(name) == nameset.end()){
if (!contains_empty) return false;
else curr_name = "";
}

// check if primary alignment, if filter options require it to be so
bool not_primary = (bin_flag >> 8) & 1; // true if not primary alignment
if (not_primary && primary_map) return false;

int s, e;
bool works = false;
for (auto t : nuc_ranges.at(curr_name)){
Expand Down
1 change: 1 addition & 0 deletions tests/aux_files/sam_single.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SRR18071790.135115 256 CM042140.1 39405151 1 61M741N39M * 0 0 GACAGTTTTACTGAGAGGTCGGCATTCTCCCATGGTAACAACATCACCAATCTGAACATCCCTGAAACAGGGGCTCATGTGCACGGACATGTTTTTATGC AAFFFFAJJJJJJJJJJJJJJJJJ-FJFJJJJFJFJJJJJJJJJJFJJJJJJJJFJJJFFJAAFAJJJJFJFFJAFFAFFJJJJJJFJJJJFFJFJJAFF NH:i:4 HI:i:4 AS:i:98 nM:i:0
Binary file modified tests/aux_files/test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/aux_files/test_distances.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/aux_files/test_new_root.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/aux_files/test_node_added.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/aux_files/test_reroot_distances.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 21 additions & 1 deletion tests/test_sam_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class TestSamEntry:
def test_init(self):
entry = SamEntry('b', 'b', 'b', 1, 2, 0.1)
entry = SamEntry('b', 'b', 'b', 1, 2, 0.1, 0)

def test_read_and_extract(self):
sf = SamFile()
Expand All @@ -30,3 +30,23 @@ def test_get_props(self):
m = entries[0].get_mapped_onto()

assert entries[0].get_length() == len(entries[0])

def test_get_binary_flag_0(self):
entry = SamEntry('b', 'b', 'b', 1, 2, 0.1, 256)
assert entry.get_binary_flag() == 256

def test_get_binary_flag_1(self):
sf = SamFile()
sf.read('aux_files/sam_single.sam')
entry = sf.get_entries()[0]
assert entry.get_binary_flag() == 256

def test_get_binary_flag_2(self):
sf = SamFile()
sf.read('aux_files/sam_test.sam')
entries = sf.get_entries()

assert len(entries) == 12

assert entries[0].get_binary_flag() == 0
assert entries[6].get_binary_flag() == 256
15 changes: 14 additions & 1 deletion tests/test_sam_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,4 +208,17 @@ def test_get_names(self):

names = sf.get_seq_names()
assert isinstance(names, set)
assert len(names) == 10
assert len(names) == 10

def test_read_bin_flag_filter(self):
sf = SamFile()
sf.set_primary_map_true()
sf.read("aux_files/sam_test.sam")
assert len(sf.get_entries()) == 11

def test_read_bin_flag_filter_2(self):
sf = SamFile()
sf.set_filter_options([0], [-1], ["CM042140.1"], 97)
sf.set_primary_map_true()
sf.read("aux_files/sam_test.sam")
assert len(sf.get_entries()) == 8

0 comments on commit 8e6b449

Please sign in to comment.