Skip to content

Commit

Permalink
mapped names
Browse files Browse the repository at this point in the history
  • Loading branch information
sodiumnitrate committed Oct 4, 2023
1 parent 8bf1057 commit 6dcea53
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 163 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ on:
push:
branches:
- master
paths-ignore:
- 'tests/aux_files/**'

jobs:
pre-commit:
Expand Down
78 changes: 0 additions & 78 deletions .github/workflows/wheels.yml

This file was deleted.

31 changes: 31 additions & 0 deletions src/genome_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,38 @@ std::vector<unsigned int> GenomeMap::get_heatmap(int start, int end){
std::vector<unsigned int> sub(first, last);
return sub;
}

void GenomeMap::set_mapped_entries(std::vector<SamEntry*> mapped_entries_){
mapped_entries = mapped_entries_;
}

std::vector<std::string> GenomeMap::get_mapped_read_names(int start, int end){
// start and end are *not* relative to vector indices. They are absolute nucleotide indices
std::vector<std::string> result;

if (start < heatmap_start || end > heatmap_end){
std::cout << "WARNING: requested range=(" << start <<','<<end<<") is outside the heatmap range=(" << heatmap_start << ',' << heatmap_end << ")." << std::endl;
return result;
}

// pick the mapped entries that overlap with the requested range
int start_idx, end_idx;
std::string read_name;
for (auto& t : mapped_entries){
start_idx = t->get_start_pos();
end_idx = t->get_end_pos();
read_name = t->get_read_name();
if (start_idx < end && end_idx > start){
result.push_back(read_name);
}
}

return result;
}

void GenomeMap::set_sample_name(std::string samp){sample_name = samp;}
std::string GenomeMap::get_sample_name(){return sample_name;}

// the following method sets heatmap -- to use from SamFile and not expose it to python
void GenomeMap::set_heatmap(std::vector<unsigned int> heatmap_, int heatmap_start_, int heatmap_end_){
heatmap = heatmap_;
Expand All @@ -56,5 +86,6 @@ void init_genome_map(py::module_ &m){
.def("set_sample_name", &GenomeMap::set_sample_name)
.def_property("sample_name", &GenomeMap::get_sample_name, &GenomeMap::set_sample_name)
.def("get_heatmap", &GenomeMap::get_heatmap)
.def("get_mapped_read_names", &GenomeMap::get_mapped_read_names)
;
}
5 changes: 5 additions & 0 deletions src/include/genome_map.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once
#include <string>
#include <vector>
#include "sam_entry.hpp"

// genome map, to be created and utilized by SamFile
// TODO: functionality to get names of reads that map to each pos?
Expand All @@ -11,6 +12,8 @@ class GenomeMap{
std::vector<unsigned int> heatmap;
int heatmap_start = 0;
int heatmap_end = -1;

std::vector<SamEntry*> mapped_entries;
public:
GenomeMap();
std::vector<unsigned int> get_heatmap(int start, int end);
Expand All @@ -20,4 +23,6 @@ class GenomeMap{
void set_sample_name(std::string samp);
void set_heatmap(std::vector<unsigned int> heatmap_, int heatmap_start_, int heatmap_end_);
void add_map(GenomeMap* new_gm);
std::vector<std::string> get_mapped_read_names(int start, int end);
void set_mapped_entries(std::vector<SamEntry*> mapped_entries_);
};
21 changes: 21 additions & 0 deletions src/include/sam_entry.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once
#include <string>
#include <vector>

class SamEntry{
std::string read_name;
std::string sequence;
std::string mapped_onto;
int start_pos;
int end_pos;

float alignment_score;
public:
SamEntry(std::string& rn, std::string& s, std::string& m, int st, int ed, float as);
std::string get_read_name();
std::string get_seq_str();
std::string get_mapped_onto();
int get_start_pos();
int get_end_pos();
int get_length();
};
12 changes: 4 additions & 8 deletions src/include/sam_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,9 @@
#include "genome_map.hpp"
#include "sam_filter.hpp"
#include "seq_set.hpp"
#include "sam_entry.hpp"

class SamFile{
// path to the .sam file
std::string file_name;

// filter options
std::vector<int> start_indices = {0}; //for each name in mapped onto, give start and end
std::vector<int> end_indices = {-1};
Expand All @@ -27,7 +25,7 @@ class SamFile{
std::vector<std::string> headers;

// vector that will hold the read lines
std::vector<std::string> entries;
std::vector<SamEntry> entries;

// unordered_map to hold info about multimappers
std::unordered_map<std::string, std::vector<int> > unique_names_to_entry_idx;
Expand All @@ -42,18 +40,16 @@ class SamFile{
SamFile();
void set_filter_options(std::vector<int> start_, std::vector<int> end_, std::vector<std::string> mapped_onto_, float min_score_);
int size();
void set_file_name(std::string file_name_);
std::string get_file_name();
void set_normalized_true();
void set_normalized_false();
bool get_normalized();
int get_seq_start();
int get_seq_end();
void get_lengths_from_fasta(std::string fasta_file_name);
void read();
void read(std::string file_name);
GenomeMap get_genome_map(std::string mapped_name, std::string sample_name);
void add_sam_file(SamFile* other);
std::vector<std::string> get_entries();
std::vector<SamEntry> get_entries();
std::vector<std::string> get_headers();
bool are_filters_equal(SamFile* other);
void copy_filters_from_another(SamFile* other);
Expand Down
19 changes: 19 additions & 0 deletions src/sam_entry.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "include/sam_entry.hpp"

// constructor
SamEntry::SamEntry(std::string& rn, std::string& s, std::string& m, int st, int ed, float as)
{
read_name = rn;
sequence = s;
mapped_onto = m;
start_pos = st;
end_pos = ed;
alignment_score = as;
}

std::string SamEntry::get_read_name(){ return read_name; }
std::string SamEntry::get_seq_str(){ return sequence; }
std::string SamEntry::get_mapped_onto(){ return mapped_onto; }
int SamEntry::get_start_pos(){ return start_pos; }
int SamEntry::get_end_pos(){ return end_pos; }
int SamEntry::get_length(){ return end_pos - start_pos + 1; }
42 changes: 22 additions & 20 deletions src/sam_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,6 @@ int SamFile::size(){
return entries.size();
}

void SamFile::set_file_name(std::string file_name_) { file_name = file_name_;}

std::string SamFile::get_file_name(){return file_name;}

void SamFile::set_normalized_true(){normalized_score = true;}

void SamFile::set_normalized_false(){normalized_score = false;}
Expand All @@ -56,7 +52,7 @@ void SamFile::get_lengths_from_fasta(std::string fasta_file_name){
}

// read info from sam file
void SamFile::read(){
void SamFile::read(std::string file_name){
std::cout << min_score << std::endl;
// release GIL to be able to run this parallel from python side
py::gil_scoped_release release;
Expand Down Expand Up @@ -105,7 +101,7 @@ void SamFile::read(){
auto length = seq.length();

// apply filters----------------
if (!(filter_obj.query(ref_name, pos, pos + length))) continue;
if (!(filter_obj.query(ref_name, pos, pos + length - 1))) continue;
// -----------------------------

// check alignment score
Expand Down Expand Up @@ -133,7 +129,8 @@ void SamFile::read(){
idx++;

// passed all the tests
entries.push_back(line);
SamEntry entry(seq_name, seq, ref_name, pos, pos + length - 1, score);
entries.push_back(entry);
seq_start = fmin(seq_start, pos);
seq_end = fmax(seq_end, pos + length);
}
Expand Down Expand Up @@ -181,22 +178,31 @@ GenomeMap SamFile::get_genome_map(std::string mapped_name, std::string sample_na
std::fill(heatmap.begin(), heatmap.end(), 0);

std::string dummy, seq, ref_name, seq_name;
int pos;
int pos, end;
int multi;
for(auto t : entries){
std::istringstream ss(t);
ss >> seq_name >> dummy >> ref_name >> pos >> dummy >> dummy >> dummy >> dummy >> dummy >> seq;
bool overlap;
std::vector<SamEntry*> mapped;
for(auto& t : entries){
overlap = false;
ref_name = t.get_mapped_onto();
seq_name = t.get_seq_str();
pos = t.get_start_pos();
end = t.get_end_pos();
// skip if name doesn't match
if(ref_name.compare(mapped_name) != 0) continue;

if (multiplicity.empty()) multi = 1;
else multi = multiplicity[seq_name];

for(unsigned int i = pos - seq_start; i < pos + seq.length() - seq_start; i++){
for(unsigned int i = pos - seq_start; i < end - seq_start; i++){
heatmap[i] += multi;
overlap = true;
}

if (overlap) mapped.push_back(&t);
}
result.set_heatmap(heatmap, seq_start, seq_end);
result.set_mapped_entries(mapped);
return result;
}

Expand Down Expand Up @@ -251,7 +257,7 @@ void SamFile::add_sam_file(SamFile* other){
return;
}

std::vector<std::string> SamFile::get_entries(){return entries;}
std::vector<SamEntry> SamFile::get_entries(){return entries;}
std::vector<std::string> SamFile::get_headers(){return headers;}

void SamFile::generate_multimapping_stats(){
Expand All @@ -262,9 +268,8 @@ void SamFile::generate_multimapping_stats(){

std::string read_name;
int idx = 0;
for (const auto& t : entries){
std::istringstream ss(t);
ss >> read_name;
for (auto& t : entries){
read_name = t.get_read_name();
unique_names_to_entry_idx[read_name].push_back(idx);
idx++;
}
Expand All @@ -284,9 +289,6 @@ void init_sam_file(py::module_ &m){
[](SamFile &a){
return a.size();
})
.def("set_file_name", &SamFile::set_file_name)
.def("get_file_name", &SamFile::get_file_name)
.def_property("file_name", &SamFile::get_file_name, &SamFile::set_file_name)
.def("set_normalized_true", &SamFile::set_normalized_true)
.def("set_normalized_false", &SamFile::set_normalized_false)
.def("get_lengths_from_fasta", &SamFile::get_lengths_from_fasta)
Expand All @@ -295,7 +297,7 @@ void init_sam_file(py::module_ &m){
.def("get_seq_start", &SamFile::get_seq_start)
.def("get_seq_end", &SamFile::get_seq_end)
.def("get_genome_map", &SamFile::get_genome_map)
.def("get_entries", &SamFile::get_entries)
//.def("get_entries", &SamFile::get_entries)
.def("get_headers", &SamFile::get_headers)
.def("__repr__",
[](SamFile &a){
Expand Down
12 changes: 1 addition & 11 deletions src/sequence_analysis/sam_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,6 @@ class SamFile(SamFile_cpp):
"""
Python bindings for the SamFile class.
"""
def __init__(self, file_name=None):
"""
Overloads the C++ constructor.
"""
super(SamFile, self).__init__()
if file_name is not None:
if not isinstance(file_name, str):
raise TypeError
self.file_name = file_name

def read_multiple_files(self, file_names):
"""
Expand All @@ -39,7 +30,6 @@ def read_multiple_files(self, file_names):

def read_parallel(self, file_name):
sam = SamFile()
sam.file_name = file_name
sam.copy_filters_from_another(self)
sam.read()
sam.read(file_name)
return sam
Loading

0 comments on commit 6dcea53

Please sign in to comment.