Skip to content

Commit

Permalink
allow to exclude MSA sites from the analysis (#189)
Browse files Browse the repository at this point in the history
1. Partition file now supports comment lines such that individual partitions can be easily excluded, e.g.

```
 LG+G, p1 = 1-20, 81-113
 #disable partition p2
 #WAG+G, p2 = 21-50
 JTT, p3 = 51-70
```

2. To prevent excluding MSA sites by mistake, you need to add `--force msa_extra_cols` to explicitely disable partition completeness check
  • Loading branch information
amkozlov committed Jan 17, 2025
1 parent 8ad1683 commit 9c40cd4
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 7 deletions.
9 changes: 4 additions & 5 deletions src/PartitionedMSA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ PartitionedMSA& PartitionedMSA::operator=(PartitionedMSA&& other)
_full_msa = std::move(other._full_msa);
_taxon_names = std::move(other._taxon_names);
_taxon_id_map = std::move(other._taxon_id_map);
_unassigned_sites = std::move(other._unassigned_sites);
_subst_linkage = std::move(other._subst_linkage);
_freqs_linkage = std::move(other._freqs_linkage);
return *this;
Expand Down Expand Up @@ -64,17 +65,15 @@ uintVector PartitionedMSA::get_site_part_assignment() const
p++;
}

assert(p == part_count());

/* check if all sites were assigned to partitions */
MissingPartitionForSiteException e_unassinged;
for (size_t i = 0; i < full_len; ++i)
{
if (!spa[i])
e_unassinged.add_unassigned_site(i+1);
_unassigned_sites.push_back(i+1);
}

if (e_unassinged.count() > 0)
throw e_unassinged;

return spa;
}

Expand Down
3 changes: 3 additions & 0 deletions src/PartitionedMSA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class PartitionedMSA
std::vector<PartitionInfo>& part_list() { return _part_list; };
const NameList& taxon_names() const { return _taxon_names; };
const NameIdMap& taxon_id_map() const { return _taxon_id_map; }
const IDVector& unassigned_sites() const { return _unassigned_sites; };

size_t full_msa_site(size_t index, size_t site) const;
const uintVector& site_part_map() const;
Expand All @@ -41,6 +42,7 @@ class PartitionedMSA
/* given in elements (NOT in bytes) */
size_t taxon_clv_size() const;


// setters
void full_msa(MSA&& msa);
void part_msa(size_t index, MSA&& msa) { _part_list.at(index).msa(std::move(msa)); };
Expand Down Expand Up @@ -82,6 +84,7 @@ class PartitionedMSA
NameList _taxon_names;
NameIdMap _taxon_id_map;
mutable uintVector _site_part_map;
mutable IDVector _unassigned_sites;
double _difficulty_score;
uintVector _subst_linkage;
uintVector _freqs_linkage;
Expand Down
5 changes: 5 additions & 0 deletions src/io/part_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ RaxmlPartitionStream& operator>>(RaxmlPartitionStream& stream, PartitionInfo& pa
case '\t':
/* ignore whitespace */
break;
case '#':
/* comment: ignore line */
while (stream.peek() != '\n' && stream.peek() != EOF)
stream.get();
break;
case '\r':
/* handle Windows line breaks (\r\n) */
if (stream.peek() == '\n')
Expand Down
31 changes: 31 additions & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,37 @@ bool check_msa(RaxmlInstance& instance)
msa_valid &= parted_msa_view.taxon_name_map().empty();
}

if (!parted_msa.unassigned_sites().empty())
{
auto extra_sites = parted_msa.unassigned_sites();
if (opts.safety_checks.isset(SafetyCheck::msa_extra_cols))
{
LOG_ERROR << "\nERROR: Found " << extra_sites.size() <<
" alignment site(s) which are not assigned to any partition:" << endl;

for (size_t i = 0; i < extra_sites.size(); ++i)
{
LOG_ERROR << extra_sites[i] << " ";
if (i >= 1000)
{
LOG_ERROR << "...";
break;
}
}

LOG_ERROR << "\nNOTE: If you want to ignore these sites, "
"please add '--force msa_extra_cols' option." << endl;

msa_valid = false;
}
else
{
LOG_WARN << "\nWARNING: Found " << extra_sites.size() <<
" alignment site(s) which are not assigned to any partition."
" They will be ignored." << endl;
}
}

/* check for duplicate sequences */
if (opts.safety_checks.isset(SafetyCheck::msa_dups))
{
Expand Down
2 changes: 2 additions & 0 deletions src/util/SafetyCheck.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ SafetyCheck::Flags SafetyCheck::from_string(const string& s)
return SafetyCheck::msa_dups;
else if (s == "msa_allgaps")
return SafetyCheck::msa_allgaps;
else if (s == "msa_extra_cols")
return SafetyCheck::msa_extra_cols;
else if (s == "model")
return SafetyCheck::model;
else if (s == "model_zero_freqs")
Expand Down
3 changes: 2 additions & 1 deletion src/util/SafetyCheck.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ struct SafetyCheck
msa_names = 1 << 10,
msa_dups = 1 << 11,
msa_allgaps = 1 << 12,
msa = msa_names | msa_dups | msa_allgaps,
msa_extra_cols = 1 << 13,
msa = msa_names | msa_dups | msa_allgaps | msa_extra_cols,
model_zero_freqs = 1 << 21,
model_invalid_freqs = 1 << 22,
model_lg4_freqs = 1 << 23,
Expand Down
2 changes: 1 addition & 1 deletion src/version.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#define RAXML_VERSION "2.0.0-dev"
#define RAXML_DATE "15.01.2025"
#define RAXML_DATE "17.01.2025"
#define RAXML_INTVER 200

0 comments on commit 9c40cd4

Please sign in to comment.