Skip to content

Commit

Permalink
Move command line processing and help messages to function modules; a…
Browse files Browse the repository at this point in the history
…dd -seed to everything that needs it. Issue #3.
  • Loading branch information
brianwalenz committed Jul 15, 2022
1 parent 31141c1 commit ffb63f6
Show file tree
Hide file tree
Showing 24 changed files with 1,248 additions and 1,195 deletions.
156 changes: 135 additions & 21 deletions src/seqrequester/extract.C
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,138 @@

#include "seqrequester.H"

#include "arrays.H"
#include "sequence.H"
//  Examine the command line word at argv[arg] and, if it belongs to
//  'extract' mode, record its effect in this object.
//
//    mode - current operating mode.  Set to modeExtract when the word is
//           "extract"; every other word is recognized only while mode is
//           already modeExtract.
//    arg  - index of the word to examine.  Advanced past any value the
//           option consumes.
//    argc - number of words in argv.
//    argv - the command line.
//
//  Returns true if the word was recognized and consumed, false otherwise.
//  An option that needs a value but is the last word on the command line
//  is reported on stderr and NOT consumed (returns false) instead of
//  reading argv[argc], which is a null pointer.
//
bool
extractParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) {
  char const *opt = argv[arg];

  //  The mode word itself is accepted regardless of the current mode.

  if (strcmp(opt, "extract") == 0) {
    mode = modeExtract;
    return(true);
  }

  //  Everything else is ours only when we're in extract mode.

  if (mode != modeExtract)
    return(false);

  //  Options that unconditionally consume the following word.  Check that
  //  the word exists before any of the branches below dereference it.

  bool const needsValue = ((strcmp(opt, "-bases")     == 0) ||
                           (strcmp(opt, "-sequences") == 0) ||
                           (strcmp(opt, "-length")    == 0));

  if ((needsValue == true) && (arg + 1 >= argc)) {
    fprintf(stderr, "ERROR: option '%s' requires a value.\n", opt);
    return(false);
  }

  if      (strcmp(opt, "-bases") == 0) {
    decodeRange(argv[++arg], baseBgn, baseEnd);
  }

  else if (strcmp(opt, "-sequences") == 0) {
    seqsArgs.push_back(argv[++arg]);     //  Decoded later; may be a range list or a file of names.
  }

  else if (strcmp(opt, "-length") == 0) {
    lensArgs.push_back(argv[++arg]);
  }

  else if (strcmp(opt, "-reverse") == 0) {
    asReverse = true;
  }

  else if (strcmp(opt, "-complement") == 0) {
    asComplement = true;
  }

  else if (strcmp(opt, "-rc") == 0) {
    asReverse    = true;
    asComplement = true;
  }

  else if (strcmp(opt, "-upper") == 0) {
    asUpperCase = true;
  }

  else if (strcmp(opt, "-lower") == 0) {
    asLowerCase = true;
  }

  else if (strcmp(opt, "-compress") == 0) {
    asCompressed = true;
  }

  else if (strcmp(opt, "-lowermask") == 0) {
    doMasking = true;
    maskWithN = false;
  }

  else if (strcmp(opt, "-nmask") == 0) {
    doMasking = true;
    maskWithN = true;
  }

  else if (strcmp(opt, "-fasta") == 0) {
    outputFASTA = true;
  }

  else if (strcmp(opt, "-fastq") == 0) {
    outputFASTQ = true;

    //  The quality value is optional; take the next word only if it
    //  exists and starts with a digit.
    if ((arg + 1 < argc) && ('0' <= argv[arg+1][0]) && (argv[arg+1][0] <= '9'))
      outputQV = strtouint32(argv[++arg]);
  }

  else {
    return(false);
  }

  return(true);
}



void
extractParameters::finalize(void) {
extractParameters::showUsage(opMode mode) {

if (mode != modeExtract)
return;

fprintf(stderr, "OPTIONS for extract mode:\n");
fprintf(stderr, " -bases baselist extract bases specified in the 'baselist' from\n");
fprintf(stderr, " each sequence\n");
fprintf(stderr, " -sequences seqlist extract ordinal sequences specified in the 'seqlist'\n");
fprintf(stderr, " -sequences namefile extract sequences with names listed in 'namefile'\n");
fprintf(stderr, "\n");
fprintf(stderr, " -reverse reverse the bases in the sequence\n");
fprintf(stderr, " -complement complement the bases in the sequence\n");
fprintf(stderr, " -rc alias for -reverse -complement\n");
fprintf(stderr, "\n");
fprintf(stderr, " -compress compress homopolymer runs to one base\n");
fprintf(stderr, "\n");
fprintf(stderr, " -upcase\n");
fprintf(stderr, " -downcase\n");
fprintf(stderr, "\n");
fprintf(stderr, " -fasta write output as FASTA\n");
fprintf(stderr, " -fastq [q] write output as FASTQ; if no quality values, use q\n");
fprintf(stderr, " (integer, 0-based) for all bases; default q=20\n");
fprintf(stderr, "\n");
fprintf(stderr, " -length min-max print sequence if it is at least 'min' bases and at\n");
fprintf(stderr, " most 'max' bases long\n");
fprintf(stderr, "\n");
fprintf(stderr, " a 'baselist' is a set of integers formed from any combination\n");
fprintf(stderr, " of the following, seperated by a comma:\n");
fprintf(stderr, " num a single number\n");
fprintf(stderr, " bgn-end a range of numbers: bgn <= end\n");
fprintf(stderr, " bases are spaced-based; -bases 0-2,4 will print the bases between\n");
fprintf(stderr, " the first two spaces (the first two bases) and the base after the\n");
fprintf(stderr, " fourth space (the fifth base).\n");
fprintf(stderr, " \n");
fprintf(stderr, " a 'seqlist' is a set of integers formed from any combination\n");
fprintf(stderr, " of the following, seperated by a comma:\n");
fprintf(stderr, " num a single number\n");
fprintf(stderr, " bgn-end a range of numbers: bgn <= end\n");
fprintf(stderr, " sequences are 1-based; -sequences 1,3-5 will print the first, third,\n");
fprintf(stderr, " fourth and fifth sequences.\n");
fprintf(stderr, " \n");
fprintf(stderr, " a 'namefile' contains the names of sequences to extract, one per line,\n");
fprintf(stderr, " without any leading '>' or '@'. a name is the first word on the ident line.\n");
fprintf(stderr, " NOTA BENE: at most one namefile cn be supplied.\n");
fprintf(stderr, " \n");
}



bool
extractParameters::checkOptions(opMode mode, vector<char const *> &inputs, vector<char const *> &errors) {

if (mode != modeExtract)
return(false);

if (inputs.size() == 0)
sprintf(errors, "ERROR: No input sequence files supplied.\n");

// Decode base ranges.

Expand All @@ -45,7 +160,6 @@ extractParameters::finalize(void) {
// Decode sequence ranges.

for (auto arg : seqsArgs) {
fprintf(stderr, "Parse '%s'\n", arg);
if (fileExists(arg) == true)
seqsName.load(arg, splitWords);
else
Expand Down Expand Up @@ -84,10 +198,8 @@ extractParameters::finalize(void) {
// To us, sequences begin at zero.

for (uint32 si=0; si<seqsBgn.size(); si++) {
if (seqsBgn[si] == 0) {
fprintf(stderr, "ERROR: sequences begin at 1, not zero.\n");
exit(1);
}
if (seqsBgn[si] == 0)
sprintf(errors, "ERROR: sequences begin at 1, not zero.\n");

seqsBgn[si] -= 1;
}
Expand All @@ -100,11 +212,9 @@ extractParameters::finalize(void) {
if (baseBgn[bi] == baseEnd[bi])
baseEnd[bi] += 1;

if (baseEnd[bi] <= baseBgn[bi]) {
fprintf(stderr, "ERROR: base range %lu-%lu is invalid, must be increasing.\n",
if (baseEnd[bi] <= baseBgn[bi])
sprintf(errors, "ERROR: base range %lu-%lu is invalid, must be increasing.\n",
baseBgn[bi], baseEnd[bi]);
exit(1);
}
}

// Allocate a big string, but since we don't know the actual max length,
Expand All @@ -126,6 +236,8 @@ extractParameters::finalize(void) {
C['C'] = 'G'; U['C'] = 'C'; L['C'] = 'c';
C['G'] = 'C'; U['G'] = 'G'; L['G'] = 'g';
C['T'] = 'A'; U['T'] = 'T'; L['T'] = 't';

return(errors.size() > 0);
}


Expand Down Expand Up @@ -277,8 +389,8 @@ printSequence(dnaSeq &seq,


void
doExtract(vector<char *> &inputs,
extractParameters &extPar) {
doExtract(vector<char const *> &inputs,
extractParameters &extPar) {
dnaSeq seq;

for (uint32 fi=0; fi<inputs.size(); fi++) {
Expand All @@ -287,7 +399,7 @@ doExtract(vector<char *> &inputs,
// If desired names exist or the file is compressed, load every sequence
// and output a sequence if is desired.
//
if ((extPar.seqsName.size() > 0) || (sf->isCompressed() == true)) {
if ((extPar.seqsName.size() > 0) || (sf->isIndexable() == false)) {
while (sf->loadSequence(seq) == true)
if (isDesired(sf->seqIdx(), seq.ident(), seq.length(), extPar))
printSequence(seq, sf, extPar);
Expand All @@ -299,12 +411,14 @@ doExtract(vector<char *> &inputs,
// reported as errors.
//
else {
sf->generateIndex();

for (uint32 si=0; si<extPar.seqsBgn.size(); si++) {
uint64 sbgn = min(extPar.seqsBgn[si], sf->numberOfSequences());
uint64 send = min(extPar.seqsEnd[si], sf->numberOfSequences());

for (uint64 ss=sbgn; ss<send; ss++)
if (isDesired(sf->seqIdx(), nullptr, sf->sequenceLength(ss), extPar) &&
if (isDesired(ss, nullptr, sf->sequenceLength(ss), extPar) &&
sf->findSequence(ss) &&
sf->loadSequence(seq))
printSequence(seq, sf, extPar);
Expand Down
63 changes: 33 additions & 30 deletions src/seqrequester/extract.H
Original file line number Diff line number Diff line change
Expand Up @@ -24,52 +24,55 @@

class extractParameters {
public:
extractParameters();
~extractParameters();
extractParameters() {}
~extractParameters() {}

void finalize(void);
bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv);
void showUsage(opMode mode);

bool checkOptions(opMode mode, std::vector<char const *> &inputs, std::vector<char const *> &errors);

private:
void loadNames(void);

public:
vector<char const *> baseArgs; // Base ranges to print
vector<uint64> baseBgn; //
vector<uint64> baseEnd; //
std::vector<char const *> baseArgs; // Base ranges to print
std::vector<uint64> baseBgn; //
std::vector<uint64> baseEnd; //

vector<char const *> seqsArgs; // Sequence ranges to print
vector<uint64> seqsBgn; //
vector<uint64> seqsEnd; //
stringList seqsName; //
std::vector<char const *> seqsArgs; // Sequence ranges to print
std::vector<uint64> seqsBgn; //
std::vector<uint64> seqsEnd; //
stringList seqsName; //

vector<char const *> lensArgs;
vector<uint64> lensMin; // Length ranges to print
vector<uint64> lensMax; //
std::vector<char const *> lensArgs;
std::vector<uint64> lensMin; // Length ranges to print
std::vector<uint64> lensMax; //

bool asReverse = false;
bool asComplement = false;
bool asReverse = false;
bool asComplement = false;

bool asUpperCase = false;
bool asLowerCase = false;
bool asUpperCase = false;
bool asLowerCase = false;

bool asCompressed = false;
bool asCompressed = false;

bool doMasking = false; // Mask out any base not in baseBgn/baseEnd with 'N'
bool doMasking = false; // Mask out any base not in baseBgn/baseEnd with 'N'

bool maskWithN = true; // Mask with lowercase sequence instead of 'N'
bool maskWithN = true; // Mask with lowercase sequence instead of 'N'

bool outputFASTA = false;
bool outputFASTQ = false;
uint8 outputQV = 20;
bool outputFASTA = false;
bool outputFASTQ = false;
uint8 outputQV = 20;

// Data for doing the extraction.

char C[256] = {0}; // Complement a base
char U[256] = {0}; // Uppercase a base
char L[256] = {0}; // Lowercase a base
char C[256] = {0}; // Complement a base
char U[256] = {0}; // Uppercase a base
char L[256] = {0}; // Lowercase a base

uint64 outputBasesLen = 0;
uint64 outputBasesMax = 0; //1048576;
char *outputBases = nullptr; //new char [outputBasesMax];
uint8 *outputQuals = nullptr; //new char [outputBasesMax];
uint64 outputBasesLen = 0;
uint64 outputBasesMax = 0; //1048576;
char *outputBases = nullptr; //new char [outputBasesMax];
uint8 *outputQuals = nullptr; //new char [outputBasesMax];
};
Loading

0 comments on commit ffb63f6

Please sign in to comment.