Skip to content

Commit

Permalink
Merge pull request #85 from pvanheus/rewrite_count_constant_sites
Browse files Browse the repository at this point in the history
Rewrite how count_constant_sites is implemented, add test
  • Loading branch information
andrewjpage authored Oct 22, 2019
2 parents 52e9b68 + 5108681 commit 657e74d
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 23 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.5.0
2.5.1
14 changes: 7 additions & 7 deletions src/alignment-file.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,18 @@ void get_bases_for_each_snp(char filename[], char ** bases_for_snps)
gzclose(fp);
}

void detect_snps(char filename[], int pure_mode, int output_monomorphic, int output_constant_site_counts)
void detect_snps(char filename[], int pure_mode, int output_monomorphic) {
detect_snps_count_constant_sites(filename, pure_mode, output_monomorphic, NULL);
}

void detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int* constant_site_counts)
{
int i;
int l;
number_of_snps = 0;
number_of_samples = 0;
length_of_genome = 0;
char * first_sequence;
int base_counts[] = {0, 0, 0, 0};
/* array below allows quick mapping of A, C, T and G characters to indices in base_counts array */
const int char_to_base_count_index[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3};

Expand Down Expand Up @@ -195,15 +198,12 @@ void detect_snps(char filename[], int pure_mode, int output_monomorphic, int out
{
snp_locations[current_snp_index] = i;
current_snp_index++;
} else if (is_pure(first_sequence[i])) {
base_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++;
} else if (constant_site_counts != NULL && is_pure(first_sequence[i])) {
constant_site_counts[char_to_base_count_index[(int) toupper(first_sequence[i])]]++;
}

}

if (output_constant_site_counts)
printf("%d,%d,%d,%d\n", base_counts[0], base_counts[1], base_counts[2], base_counts[3]);

free(first_sequence);
kseq_destroy(seq);
gzclose(fp);
Expand Down
3 changes: 2 additions & 1 deletion src/alignment-file.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

#include "kseq.h"

void detect_snps( char filename[], int pure_mode, int output_monomorphic, int output_constant_site_counts);
void detect_snps( char filename[], int pure_mode, int output_monomorphic);
void detect_snps_count_constant_sites(char filename[], int pure_mode, int output_monomorphic, int *constant_site_counts);
void get_bases_for_each_snp(char filename[], char ** bases_for_snps);
int is_unknown(char base);
int get_length_of_genome();
Expand Down
3 changes: 1 addition & 2 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include <ctype.h>
#include <unistd.h>
#include <getopt.h>
#include "alignment-file.h"
#include "snp-sites.h"
#include "config.h"

Expand Down Expand Up @@ -126,7 +125,7 @@ int main (int argc, char **argv) {
strncpy(multi_fasta_filename, argv[optind], FILENAME_MAX);

if (output_constant_site_counts) {
detect_snps(multi_fasta_filename, pure_mode, output_monomorphic, output_constant_site_counts);
count_constant_sites(multi_fasta_filename, output_filename);
} else if( pure_mode || output_monomorphic)
{
generate_snp_sites_with_ref_pure_mono(multi_fasta_filename,
Expand Down
27 changes: 26 additions & 1 deletion src/snp-sites.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ static int generate_snp_sites_generic(char filename[],
int output_monomorphic)
{
int i;
detect_snps(filename, pure_mode, output_monomorphic, 0);
detect_snps(filename, pure_mode, output_monomorphic);

bases_for_snps = calloc(get_number_of_snps()+1, sizeof(char*));

Expand Down Expand Up @@ -141,7 +141,32 @@ int generate_snp_sites_with_ref_pure_mono(char filename[],
output_filename, output_reference, pure_mode, output_monomorphic);
}

void count_constant_sites(char multi_fasta_filename[], char output_filename[]) {
char cwd[100];
FILE *input_file;
FILE *output_file;
int *constant_site_counts = NULL;

output_file = (FILE *) fopen(output_filename, "w");
if (!output_file) {
fprintf(stderr, "ERROR: cannot open %s for writing: %s\n", output_filename, strerror(errno));
exit(EXIT_FAILURE);
}

constant_site_counts = (int *) calloc(4, sizeof(int));
if (constant_site_counts == NULL) {
fprintf(stderr, "ERROR: cannot allocated memory for constant_site_counts");
exit(EXIT_FAILURE);
}

detect_snps_count_constant_sites(multi_fasta_filename, 0, 0, constant_site_counts);


fprintf(output_file, "%d,%d,%d,%d\n", constant_site_counts[0], constant_site_counts[1],
constant_site_counts[2], constant_site_counts[3]);
fclose(output_file);
free(constant_site_counts);
}
// Inefficient
void strip_directory_from_filename(char * input_filename, char * output_filename)
{
Expand Down
7 changes: 6 additions & 1 deletion src/snp-sites.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
#define _SNP_SITES_H_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

int generate_snp_sites(char filename[],
int output_multi_fasta_file,
Expand All @@ -42,7 +45,9 @@ int generate_snp_sites_with_ref_pure_mono(char filename[],
int output_reference,
int pure_mode,
int output_monomorphic);


void count_constant_sites(char multi_fasta_filename[], char filename[]);

void strip_directory_from_filename(char *input_filename,
char *output_filename);

Expand Down
29 changes: 19 additions & 10 deletions tests/check-snp-sites.c
Original file line number Diff line number Diff line change
Expand Up @@ -183,64 +183,64 @@ END_TEST

START_TEST (valid_genome_length)
{
detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,0);
detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0);
fail_unless( get_length_of_genome() == 2000 );
}
END_TEST

START_TEST (valid_genome_length_with_multiple_lines_per_sequence)
{
detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0,0);
detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0);
fail_unless( get_length_of_genome() == 2000 );
}
END_TEST

START_TEST (valid_number_of_sequences_in_file)
{
detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0,0);
detect_snps("../tests/data/alignment_file_one_line_per_sequence.aln",0,0);
fail_unless( get_number_of_samples() == 109 );
}
END_TEST

START_TEST (valid_number_of_sequences_in_file_with_multiple_lines_per_sequence)
{
detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0,0);
detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0);
fail_unless( get_number_of_samples() == 109 );
}
END_TEST

START_TEST (number_of_snps_detected)
{
detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0,0);
detect_snps("../tests/data/alignment_file_multiple_lines_per_sequence.aln",0,0);
fail_unless( get_number_of_snps() == 5);
}
END_TEST

START_TEST (number_of_snps_detected_small)
{
detect_snps("../tests/data/small_alignment.aln",0,0,0);
detect_snps("../tests/data/small_alignment.aln",0,0);
fail_unless( get_number_of_snps() == 1);
}
END_TEST

START_TEST (detect_snps_pure_mode)
{
detect_snps("../tests/data/pure_mode_alignment.aln",1,0,0);
detect_snps("../tests/data/pure_mode_alignment.aln",1,0);
fail_unless( get_number_of_snps() == 2);
}
END_TEST


START_TEST (detect_snps_pure_mode_monomorphic)
{
detect_snps("../tests/data/pure_mode_monomorphic_alignment.aln",1,1,0);
detect_snps("../tests/data/pure_mode_monomorphic_alignment.aln",1,1);
fail_unless( get_number_of_snps() == 3);
}
END_TEST

START_TEST (sample_names_from_alignment_file)
{
detect_snps("../tests/data/small_alignment.aln",0,0,0);
detect_snps("../tests/data/small_alignment.aln",0,0);
char ** current_sequence_names = get_sequence_names();

fail_unless(strcmp(current_sequence_names[0],"reference_sequence") == 0);
Expand Down Expand Up @@ -269,6 +269,14 @@ START_TEST (check_strip_directory_from_filename_with_directory)
}
END_TEST

START_TEST (check_count_constant_sites)
{
count_constant_sites("../tests/data/small_alignment.aln", "small_alignment.constant_site_counts.txt");
fail_unless(compare_files("../tests/data/small_alignment.constant_site_counts.txt", "small_alignment.constant_site_counts.txt"));
remove("small_alignment.constant_site_counts.txt");
}
END_TEST

Suite * snp_sites_suite (void)
{
Suite *s = suite_create ("Creating_SNP_Sites");
Expand Down Expand Up @@ -302,7 +310,8 @@ Suite * snp_sites_suite (void)
tcase_add_test (tc_snp_sites, valid_phylip_plus_reference);
tcase_add_test (tc_snp_sites, valid_alignment_with_pure_mode);
tcase_add_test (tc_snp_sites, valid_alignment_with_monomorphic_sites);

tcase_add_test (tc_snp_sites, check_count_constant_sites);

tcase_add_exit_test(tc_snp_sites, invalid_with_uneven_file_lengths,EXIT_FAILURE);
remove("uneven_alignment.aln.snp_sites.aln");
suite_add_tcase (s, tc_snp_sites);
Expand Down
1 change: 1 addition & 0 deletions tests/data/small_alignment.constant_site_counts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2,2,1,2

0 comments on commit 657e74d

Please sign in to comment.