From ffb63f684eaa5f2cc44cbf8bb6f1ba4b3d414743 Mon Sep 17 00:00:00 2001 From: "Brian P. Walenz" Date: Thu, 14 Jul 2022 23:17:48 -0400 Subject: [PATCH] Move command line processing and help messages to function modules; add -seed to everything that needs it. Issue #3. --- src/seqrequester/extract.C | 156 +++- src/seqrequester/extract.H | 63 +- src/seqrequester/generate.C | 177 +++- src/seqrequester/generate.H | 117 +-- src/seqrequester/microsatellite.C | 85 +- src/seqrequester/microsatellite.H | 23 +- src/seqrequester/mutate.C | 101 ++- src/seqrequester/mutate.H | 35 +- src/seqrequester/partition.C | 121 ++- src/seqrequester/partition.H | 63 +- src/seqrequester/sample.C | 153 +++- src/seqrequester/sample.H | 65 +- src/seqrequester/seqrequester.C | 839 ++----------------- src/seqrequester/seqrequester.H | 35 +- src/seqrequester/shiftregister-emit-fast.C | 2 +- src/seqrequester/shiftregister-search-fast.C | 2 +- src/seqrequester/shiftregister-search-slow.C | 2 +- src/seqrequester/shiftregister.C | 92 +- src/seqrequester/shiftregister.H | 19 +- src/seqrequester/simulate.C | 163 +++- src/seqrequester/simulate.H | 25 +- src/seqrequester/summarize.C | 92 +- src/seqrequester/summarize.H | 11 +- src/utility | 2 +- 24 files changed, 1248 insertions(+), 1195 deletions(-) diff --git a/src/seqrequester/extract.C b/src/seqrequester/extract.C index 674c6a0..54b6a88 100644 --- a/src/seqrequester/extract.C +++ b/src/seqrequester/extract.C @@ -19,23 +19,138 @@ #include "seqrequester.H" -#include "arrays.H" -#include "sequence.H" +bool +extractParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if (strcmp(argv[arg], "extract") == 0) { + mode = modeExtract; + } + else if ((mode == modeExtract) && (strcmp(argv[arg], "-bases") == 0)) { + decodeRange(argv[++arg], baseBgn, baseEnd); + } + else if ((mode == modeExtract) && (strcmp(argv[arg], "-sequences") == 0)) { + seqsArgs.push_back(argv[++arg]); + } -extractParameters::extractParameters() { -} + else if ((mode == modeExtract) && (strcmp(argv[arg], "-reverse") == 0)) { + asReverse = true; + } + + else if ((mode == modeExtract) && (strcmp(argv[arg], "-complement") == 0)) { + asComplement = true; + } + + else if ((mode == modeExtract) && (strcmp(argv[arg], "-rc") == 0)) { + asReverse = true; + asComplement = true; + } + + else if ((mode == modeExtract) && (strcmp(argv[arg], "-upper") == 0)) { + asUpperCase = true; + } + + else if ((mode == modeExtract) && (strcmp(argv[arg], "-lower") == 0)) { + asLowerCase = true; + } + + else if ((mode == modeExtract) && (strcmp(argv[arg], "-compress") == 0)) { + asCompressed = true; + } + + else if ((mode == modeExtract) && (strcmp(argv[arg], "-length") == 0)) { + lensArgs.push_back(argv[++arg]); + } + + else if ((mode == modeExtract) && (strcmp(argv[arg], "-lowermask") == 0)) { + doMasking = true; + maskWithN = false; + } -extractParameters::~extractParameters() { - for (uint64 ii=0; ii' or '@'. a name is the first word on the ident line.\n"); + fprintf(stderr, " NOTA BENE: at most one namefile cn be supplied.\n"); + fprintf(stderr, " \n"); +} + + + +bool +extractParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeExtract) + return(false); + + if (inputs.size() == 0) + sprintf(errors, "ERROR: No input sequence files supplied.\n"); // Decode base ranges. @@ -45,7 +160,6 @@ extractParameters::finalize(void) { // Decode sequence ranges. for (auto arg : seqsArgs) { - fprintf(stderr, "Parse '%s'\n", arg); if (fileExists(arg) == true) seqsName.load(arg, splitWords); else @@ -84,10 +198,8 @@ extractParameters::finalize(void) { // To us, sequences begin at zero. for (uint32 si=0; si 0); } @@ -277,8 +389,8 @@ printSequence(dnaSeq &seq, void -doExtract(vector &inputs, - extractParameters &extPar) { +doExtract(vector &inputs, + extractParameters &extPar) { dnaSeq seq; for (uint32 fi=0; fi &inputs, // If desired names exist or the file is compressed, load every sequence // and output a sequence if is desired. // - if ((extPar.seqsName.size() > 0) || (sf->isCompressed() == true)) { + if ((extPar.seqsName.size() > 0) || (sf->isIndexable() == false)) { while (sf->loadSequence(seq) == true) if (isDesired(sf->seqIdx(), seq.ident(), seq.length(), extPar)) printSequence(seq, sf, extPar); @@ -299,12 +411,14 @@ doExtract(vector &inputs, // reported as errors. // else { + sf->generateIndex(); + for (uint32 si=0; sinumberOfSequences()); uint64 send = min(extPar.seqsEnd[si], sf->numberOfSequences()); for (uint64 ss=sbgn; ssseqIdx(), nullptr, sf->sequenceLength(ss), extPar) && + if (isDesired(ss, nullptr, sf->sequenceLength(ss), extPar) && sf->findSequence(ss) && sf->loadSequence(seq)) printSequence(seq, sf, extPar); diff --git a/src/seqrequester/extract.H b/src/seqrequester/extract.H index 9860221..f433614 100644 --- a/src/seqrequester/extract.H +++ b/src/seqrequester/extract.H @@ -24,52 +24,55 @@ class extractParameters { public: - extractParameters(); - ~extractParameters(); + extractParameters() {} + ~extractParameters() {} - void finalize(void); + bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv); + void showUsage(opMode mode); + + bool checkOptions(opMode mode, std::vector &inputs, std::vector &errors); private: void loadNames(void); public: - vector baseArgs; // Base ranges to print - vector baseBgn; // - vector baseEnd; // + std::vector baseArgs; // Base ranges to print + std::vector baseBgn; // + std::vector baseEnd; // - vector seqsArgs; // Sequence ranges to print - vector seqsBgn; // - vector seqsEnd; // - stringList seqsName; // + std::vector seqsArgs; // Sequence ranges to print + std::vector seqsBgn; // + std::vector seqsEnd; // + stringList seqsName; // - vector lensArgs; - vector lensMin; // Length ranges to print - vector lensMax; // + std::vector lensArgs; + std::vector lensMin; // Length ranges to print + std::vector lensMax; // - bool asReverse = false; - bool asComplement = false; + bool asReverse = false; + bool asComplement = false; - bool asUpperCase = false; - bool asLowerCase = false; + bool asUpperCase = false; + bool asLowerCase = false; - bool asCompressed = false; + bool asCompressed = false; - bool doMasking = false; // Mask out any base not in baseBgn/baseEnd with 'N' + bool doMasking = false; // Mask out any base not in baseBgn/baseEnd with 'N' - bool maskWithN = true; // Mask with lowercase sequence instead of 'N' + bool maskWithN = true; // Mask with lowercase sequence instead of 'N' - bool outputFASTA = false; - bool outputFASTQ = false; - uint8 outputQV = 20; + bool outputFASTA = false; + bool outputFASTQ = false; + uint8 outputQV = 20; // Data for doing the extraction. - char C[256] = {0}; // Complement a base - char U[256] = {0}; // Uppercase a base - char L[256] = {0}; // Lowercase a base + char C[256] = {0}; // Complement a base + char U[256] = {0}; // Uppercase a base + char L[256] = {0}; // Lowercase a base - uint64 outputBasesLen = 0; - uint64 outputBasesMax = 0; //1048576; - char *outputBases = nullptr; //new char [outputBasesMax]; - uint8 *outputQuals = nullptr; //new char [outputBasesMax]; + uint64 outputBasesLen = 0; + uint64 outputBasesMax = 0; //1048576; + char *outputBases = nullptr; //new char [outputBasesMax]; + uint8 *outputQuals = nullptr; //new char [outputBasesMax]; }; diff --git a/src/seqrequester/generate.C b/src/seqrequester/generate.C index 36e9541..bd110a2 100644 --- a/src/seqrequester/generate.C +++ b/src/seqrequester/generate.C @@ -19,16 +19,160 @@ #include "seqrequester.H" -#include "arrays.H" -#include "files.H" -#include "mt19937ar.H" +bool +generateParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if (strcmp(argv[arg], "generate") == 0) { + mode = modeGenerate; + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-ident") == 0)) { + ident = argv[++arg]; + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-min") == 0)) { + minLength = strtouint64(argv[++arg]); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-max") == 0)) { + maxLength = strtouint64(argv[++arg]); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-sequences") == 0)) { + nSeqs = strtouint64(argv[++arg]); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-bases") == 0)) { + nBases = strtouint64(argv[++arg]); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-seed") == 0)) { + mt.mtSetSeed(strtouint32(argv[++arg])); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-guassian") == 0)) { + useGaussian = true; + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-mirror") == 0)) { + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-gc") == 0)) { + double gc = strtodouble(argv[++arg]); + double at = 1.0 - gc; + + gFreq = cFreq = gc / 2.0; + aFreq = tFreq = at / 2.0; + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-at") == 0)) { + double at = strtodouble(argv[++arg]); + double gc = 1.0 - at; + + gFreq = cFreq = gc / 2.0; + aFreq = tFreq = at / 2.0; + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-a") == 0) ){ + aFreq = strtodouble(argv[++arg]); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-c") == 0)) { + cFreq = strtodouble(argv[++arg]); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-g") == 0)) { + gFreq = strtodouble(argv[++arg]); + } + + else if ((mode == modeGenerate) && (strcmp(argv[arg], "-t") == 0)) { + tFreq = strtodouble(argv[++arg]); + } + + else { + return(false); + } + + return(true); +} void -doGenerate(generateParameters &genPar) { - mtRandom MT; +generateParameters::showUsage(opMode mode) { + + if (mode != modeGenerate) + return; + + fprintf(stderr, "OPTIONS for generate mode:\n"); + fprintf(stderr, " -ident S name reads using printf() pattern in S; default 'random%%08lu'\n"); + fprintf(stderr, " -min M minimum sequence length\n"); + fprintf(stderr, " -max M maximum sequence length\n"); + fprintf(stderr, " -sequences N generate N sequences\n"); + fprintf(stderr, " -bases B generate at least B bases, no more than B+maxLength-1 bases.\n"); + fprintf(stderr, " -gaussian 99.73%% of the reads (3 standard deviations) will be between min and max\n"); + fprintf(stderr, " -mirror F (not implemented; mirror the length distrib of F)\n"); + fprintf(stderr, " -gc bias sets GC/AT composition (default 0.50)\n"); + fprintf(stderr, " -at bias sets GC/AT composition (default 0.50)\n"); + fprintf(stderr, " -a freq sets frequency of A bases (default 0.25)\n"); + fprintf(stderr, " -c freq sets frequency of C bases (default 0.25)\n"); + fprintf(stderr, " -g freq sets frequency of G bases (default 0.25)\n"); + fprintf(stderr, " -t freq sets frequency of T bases (default 0.25)\n"); + fprintf(stderr, " -seed s seed the pseudo-random number generate with integer 's'\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "The -gc option is a shortcut for setting all four base frequencies at once. Order matters!\n"); + fprintf(stderr, " -gc 0.6 -a 0.1 -t 0.3 -- sets G = C = 0.3, A = 0.1, T = 0.3\n"); + fprintf(stderr, " -a 0.1 -t 0.3 -gc 0.6 -- sets G = C = 0.3, A = T = 0.15\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Base frequencies are scaled to sum to 1.0.\n"); + fprintf(stderr, " -a 1.25 -- results in a sum of 2.0 (1.25 + 0.25 + 0.25 + 0.25) so final frequencies will be:\n"); + fprintf(stderr, " A = 1.25/2 = 0.625\n"); + fprintf(stderr, " C = G = T = 0.25/2 = 0.125.\n"); + fprintf(stderr, " -gc 0.8 -a 1.0 -t 0.2 -- sum is also 2.0, final frequencies will be:\n"); + fprintf(stderr, " A = 1.00/2 = 0.5\n"); + fprintf(stderr, " C = G = 0.40/2 = 0.2\n"); + fprintf(stderr, " T = 0.20/2 = 0.1\n"); + fprintf(stderr, "\n"); +} + + + +bool +generateParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeGenerate) + return(false); + + if (inputs.size() > 0) + sprintf(errors, "ERROR: generate mode does not work with input sequence files.\n"); + + // Check for invalid. If not set up, just return. + + if ((nSeqs == 0) && (nBases == 0)) + sprintf(errors, "ERROR: at least one of -sequences and -bases must be supplied.\n"); + + if (minLength > maxLength) + sprintf(errors, "ERROR: Told to generate sequences with min length larger than max length.\n"); + // Unlimit any unset limit. + + if (nSeqs == 0) nSeqs = UINT64_MAX; + if (nBases == 0) nBases = UINT64_MAX; + + // Set Gaussian mean and standard deviation + + gMean = (minLength + maxLength) / 2.0; + gStdDev = (maxLength - minLength) / 6.0; + + // Load lengths to mirror. + + return(errors.size() > 0); +} + + + +void +doGenerate(generateParameters &genPar) { uint64 nSeqs = 0; uint64 nBases = 0; @@ -42,18 +186,31 @@ doGenerate(generateParameters &genPar) { double Gthresh = genPar.aFreq + genPar.cFreq + genPar.gFreq; double Tthresh = genPar.aFreq + genPar.cFreq + genPar.gFreq + genPar.tFreq; - if (genPar.randomSeedValid) - MT.mtSetSeed(genPar.randomSeed); + fprintf(stderr, "Generating up to " F_U64 " sequences and up to " F_U64 " bases.\n", nSeqs, nBases); + + double fSum = genPar.aFreq + genPar.cFreq + genPar.gFreq + genPar.tFreq; + + fprintf(stderr, "Using base frequencies:\n"); + fprintf(stderr, " A = %7.5f / %7.5f = %7.5f\n", genPar.aFreq, fSum, genPar.aFreq / fSum); + fprintf(stderr, " C = %7.5f / %7.5f = %7.5f\n", genPar.cFreq, fSum, genPar.cFreq / fSum); + fprintf(stderr, " G = %7.5f / %7.5f = %7.5f\n", genPar.gFreq, fSum, genPar.gFreq / fSum); + fprintf(stderr, " T = %7.5f / %7.5f = %7.5f\n", genPar.tFreq, fSum, genPar.tFreq / fSum); + + genPar.aFreq /= fSum; + genPar.cFreq /= fSum; + genPar.gFreq /= fSum; + genPar.tFreq /= fSum; + if (genPar.ident == nullptr) genPar.ident = duplicateString("random%08lu"); while ((nSeqs < genPar.nSeqs) && (nBases < genPar.nBases)) { - double len = MT.mtRandomGaussian(genPar.gMean, genPar.gStdDev); + double len = genPar.mt.mtRandomGaussian(genPar.gMean, genPar.gStdDev); while (len < -0.5) - len = MT.mtRandomGaussian(genPar.gMean, genPar.gStdDev); + len = genPar.mt.mtRandomGaussian(genPar.gMean, genPar.gStdDev); seqLen = (uint64)round(len); @@ -61,7 +218,7 @@ doGenerate(generateParameters &genPar) { resizeArrayPair(seq, qlt, 0, seqMax, seqLen+1); for (uint64 ii=0; ii &inputs, std::vector &errors); - randomSeedValid = false; - randomSeed = 0; - - useGaussian = true; - gMean = 0; - gStdDev = 0; - - useMirror = false; - mirrorInput = NULL; - mirrorDistribution = 0.0; - mirrorDistributionLen = 0; - mirrorDistributionMax = 0; - mirrorDistributionSum = 0; - - aFreq = 0.25; - cFreq = 0.25; - gFreq = 0.25; - tFreq = 0.25; - }; - - ~generateParameters() { - }; - - - void finalize(void) { - - // Check for invalid. If not set up, just return. - - if ((nSeqs == 0) && (nBases == 0)) - return; - - if (minLength > maxLength) - fprintf(stderr, "ERROR: Told to generate sequences with min length larger than max length.\n"), exit(1); - - // Unlimit any unset limit. - - fprintf(stderr, "Generating up to " F_U64 " sequences and up to " F_U64 " bases.\n", nSeqs, nBases); - - if (nSeqs == 0) nSeqs = UINT64_MAX; - if (nBases == 0) nBases = UINT64_MAX; - - // Set Gaussian mean and standard deviation - - gMean = (minLength + maxLength) / 2.0; - gStdDev = (maxLength - minLength) / 6.0; - - // Load lengths to mirror. - - // Set base frequencies. - - double fSum = aFreq + cFreq + gFreq + tFreq; - - fprintf(stderr, "Using base frequencies:\n"); - fprintf(stderr, " A = %7.5f / %7.5f = %7.5f\n", aFreq, fSum, aFreq / fSum); - fprintf(stderr, " C = %7.5f / %7.5f = %7.5f\n", cFreq, fSum, cFreq / fSum); - fprintf(stderr, " G = %7.5f / %7.5f = %7.5f\n", gFreq, fSum, gFreq / fSum); - fprintf(stderr, " T = %7.5f / %7.5f = %7.5f\n", tFreq, fSum, tFreq / fSum); - - aFreq /= fSum; - cFreq /= fSum; - gFreq /= fSum; - tFreq /= fSum; - }; - - char const *ident; - - uint64 minLength; - uint64 maxLength; +public: + char const *ident = nullptr; - uint64 nSeqs; - uint64 nBases; + uint64 minLength = 0; + uint64 maxLength = 10000; - bool randomSeedValid; - uint32 randomSeed; + uint64 nSeqs = 0; + uint64 nBases = 0; - bool useGaussian; - double gMean; - double gStdDev; + bool useGaussian = true; + double gMean = 0; + double gStdDev = 0; - bool useExponential; + bool useMirror = false; + char *mirrorInput = nullptr; + double mirrorDistribution = 0.0; + uint64 mirrorDistributionLen = 0; + uint64 mirrorDistributionMax = 0; + uint64 mirrorDistributionSum = 0; - bool useMirror; - char *mirrorInput; - double mirrorDistribution; - uint64 mirrorDistributionLen; - uint64 mirrorDistributionMax; - uint64 mirrorDistributionSum; + mtRandom mt; - double aFreq; - double cFreq; - double gFreq; - double tFreq; + double aFreq = 0.25; + double cFreq = 0.25; + double gFreq = 0.25; + double tFreq = 0.25; }; diff --git a/src/seqrequester/microsatellite.C b/src/seqrequester/microsatellite.C index 04eb62e..e058274 100644 --- a/src/seqrequester/microsatellite.C +++ b/src/seqrequester/microsatellite.C @@ -17,12 +17,85 @@ */ #include "seqrequester.H" -#include "sequence.H" #include "outAT.C" #include "outGC.C" #include "outGA.C" +bool +microsatelliteParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if ((strcmp(argv[arg], "microsatellite") == 0) || + (strcmp(argv[arg], "microsat") == 0)) { + mode = modeMicroSatellite; + } + + else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-prefix") == 0)) { + outPrefix = argv[++arg]; + } + + else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-window") == 0)) { + window = strtouint32(argv[++arg]); + } + + else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-ga") == 0)) { + report_ga = true; + } + else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-gc") == 0)) { + report_gc = true; + } + else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-at") == 0)) { + report_at = true; + } + + else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-legacy") == 0)) { + report_legacy = true; + } + + else { + return(false); + } + + return(true); +} + + + +void +microsatelliteParameters::showUsage(opMode mode) { + + if (mode != modeMicroSatellite) + return; + + fprintf(stderr, "OPTIONS for microsatellite mode:\n"); + fprintf(stderr, " -prefix P write output to

..bed\n"); + fprintf(stderr, " -window w compute in windows of size w; default write output to .bed\n"); + + fprintf(stderr, " -ga compute GA/TC repeat content\n"); + fprintf(stderr, " -gc compute GC repeat content\n"); + fprintf(stderr, " -at compute AT repeat content\n"); +} + + + +bool +microsatelliteParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeMicroSatellite) + return(false); + + if (inputs.size() == 0) + sprintf(errors, "ERROR: No input sequence files supplied.\n"); + + if ((report_ga == false) && + (report_gc == false) && + (report_at == false)) + sprintf(errors, "ERROR: No microsatellite pattern supplied.\n"); + + return(errors.size() > 0); +} + + void computeMicroSat(dnaSeq &seq, @@ -189,7 +262,7 @@ computeMicroSat(dnaSeqFile *seqFile, void -doMicroSatellite(vector &inputs, +doMicroSatellite(vector &inputs, microsatelliteParameters &msPar) { for (uint32 ff=0; ff &inputs, #if 0 int main(int argc, char **argv) { - char *seqName = NULL; - char *outPrefix = NULL; + char *seqName = nullptr; + char *outPrefix = nullptr; bool verbose = false; uint32 reportType = OP_NONE; int window = 128; @@ -263,7 +336,7 @@ main(int argc, char **argv) { arg++; } - if (seqName == NULL) + if (seqName == nullptr) err.push_back("No input sequence (-seq) supplied.\n"); if (err.size() > 0) { @@ -279,7 +352,7 @@ main(int argc, char **argv) { } fprintf(stderr, "Open sequence '%s'.\n", seqName); - dnaSeqFile *seqFile = NULL; + dnaSeqFile *seqFile = nullptr; seqFile = new dnaSeqFile(seqName); if (reportType == OP_GA) diff --git a/src/seqrequester/microsatellite.H b/src/seqrequester/microsatellite.H index 6b0e6f0..f8da317 100644 --- a/src/seqrequester/microsatellite.H +++ b/src/seqrequester/microsatellite.H @@ -24,19 +24,15 @@ class microsatelliteParameters { public: - microsatelliteParameters() { - } - ~microsatelliteParameters() { - } + microsatelliteParameters() {} + ~microsatelliteParameters() {} - void initialize(void) { - } + bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv); + void showUsage(opMode mode); - void finalize(void) { - } + bool checkOptions(opMode mode, std::vector &inputs, std::vector &errors); - - //char const *seqName; vector +public: char const *outPrefix = nullptr; uint32 window = 128; @@ -46,11 +42,4 @@ public: bool report_at = false; bool report_legacy = false; - - -#if 0 - char seqName[FILENAME_MAX+1]; - char distribName[FILENAME_MAX+1]; - char outputName[FILENAME_MAX+1]; -#endif }; diff --git a/src/seqrequester/mutate.C b/src/seqrequester/mutate.C index 223f02a..3a62abf 100644 --- a/src/seqrequester/mutate.C +++ b/src/seqrequester/mutate.C @@ -19,9 +19,90 @@ #include "seqrequester.H" -#include "arrays.H" -#include "sequence.H" -#include "mt19937ar.H" +bool +mutateParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if (strcmp(argv[arg], "mutate") == 0) { + mode = modeMutate; + } + + else if ((mode == modeMutate) && (strcmp(argv[arg], "-s") == 0)) { + double p = strtodouble(argv[arg+1]); + char a = argv[arg+2][0]; + char b = argv[arg+3][0]; + + arg += 3; + + setProbabilitySubstititue(p, a, b); + } + + else if ((mode == modeMutate) && (strcmp(argv[arg], "-i") == 0)) { + double p = strtodouble(argv[arg+1]); + char a = argv[arg+2][0]; + + arg += 2; + + setProbabilityInsert(p, a); + } + + else if ((mode == modeMutate) && (strcmp(argv[arg], "-d") == 0)) { + double p = strtodouble(argv[arg+1]); + char a = argv[arg+2][0]; + + arg += 2; + + setProbabilityDelete(p, a); + } + + else if ((mode == modeMutate) && (strcmp(argv[arg], "-seed") == 0)) { + mt.mtSetSeed(strtouint32(argv[++arg])); + } + + else { + return(false); + } + + return(true); +} + + + +void +mutateParameters::showUsage(opMode mode) { + + if (mode != modeMicroSatellite) + return; + + fprintf(stderr, "OPTIONS for mutate mode:\n"); + fprintf(stderr, " -s p a b set the probability of substituting 'a' with 'b' to 'p'.\n"); + fprintf(stderr, " -i p a set the probability of inserting an 'a' to 'p'.\n"); + fprintf(stderr, " -d p a set the probability of deleting an 'a' to 'p'.\n"); +} + + + +bool +mutateParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeMutate) + return(false); + + for (uint32 ii=0; ii<256; ii++) + pSubstitute[ii] = 0.0; + + pInsert = 0.0; + + for (uint32 ii=0; ii<256; ii++) { + for (uint32 jj=0; jj<256; jj++) + pSubstitute[ii] += pS[ii][jj]; + + pInsert += pI[ii]; + pDelete += pD[ii]; + } + + return(errors.size() > 0); +} + void @@ -71,16 +152,12 @@ doMutate_insert(double p, void -doMutate(vector &inputs, mutateParameters &mutPar) { - mtRandom MT; - - mutPar.finalize(); - +doMutate(vector &inputs, mutateParameters &mutPar) { dnaSeq seq; uint32 nMax = 0; - char *nBases = NULL; - uint8 *nQuals = NULL; + char *nBases = nullptr; + uint8 *nQuals = nullptr; for (uint32 ff=0; ff &inputs, mutateParameters &mutPar) { // If a chance of doing something, make a random number. if (mutPar.pSubstitute[base] + mutPar.pInsert + mutPar.pD[base] > 0.0) - p = MT.mtRandomRealClosed(); + p = mutPar.mt.mtRandomRealClosed(); // If a substitution, make it. if (p < mutPar.pSubstitute[base]) { @@ -144,7 +221,7 @@ doMutate(vector &inputs, mutateParameters &mutPar) { // And one more for an inserting at the end of the read. - double p = MT.mtRandomRealClosed(); + double p = mutPar.mt.mtRandomRealClosed(); if (p < mutPar.pInsert) doMutate_insert(p, 0, 0, nBases, nQuals, oLen, mutPar); diff --git a/src/seqrequester/mutate.H b/src/seqrequester/mutate.H index 6fa0694..a61e9f0 100644 --- a/src/seqrequester/mutate.H +++ b/src/seqrequester/mutate.H @@ -40,6 +40,11 @@ public: ~mutateParameters() { }; + bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv); + void showUsage(opMode mode); + + bool checkOptions(opMode mode, std::vector &inputs, std::vector &errors); + public: void setProbabilitySubstititue(double p, char baseToReplace, char replacementBase) { fprintf(stderr, "sub %c -> %c %f\n", baseToReplace, replacementBase, p); @@ -56,22 +61,6 @@ public: pD[baseToDelete] = p; }; - void finalize(void) { - - for (uint32 ii=0; ii<256; ii++) - pSubstitute[ii] = 0.0; - - pInsert = 0.0; - - for (uint32 ii=0; ii<256; ii++) { - for (uint32 jj=0; jj<256; jj++) - pSubstitute[ii] += pS[ii][jj]; - - pInsert += pI[ii]; - pDelete += pD[ii]; - } - }; - // probability of substituting base [a] with [b] (base based) // probability of inserting an A, C, G, T, N (space based) @@ -80,11 +69,13 @@ public: // extensions: // probability of inserting ACGTN between bases a and b. // - double pS[256][256]; - double pI[256]; - double pD[256]; + double pS[256][256]; + double pI[256]; + double pD[256]; + + double pSubstitute[256]; // Probability of substituting 'a' for anything. + double pInsert; // Probability of inserting any base. + double pDelete; - double pSubstitute[256]; // Probability of substituting 'a' for anything. - double pInsert; // Probability of inserting any base. - double pDelete; + mtRandom mt; }; diff --git a/src/seqrequester/partition.C b/src/seqrequester/partition.C index 5dc68ef..73055be 100644 --- a/src/seqrequester/partition.C +++ b/src/seqrequester/partition.C @@ -18,8 +18,115 @@ */ #include "seqrequester.H" -#include "sequence.H" -#include "mt19937ar.H" + +bool +partitionParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if (strcmp(argv[arg], "partition") == 0) { + mode = modePartition; + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-paired") == 0)) { + isPaired = true; + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-output") == 0)) { + strncpy(output1, argv[++arg], FILENAME_MAX); // #'s in the name will be replaced + strncpy(output2, argv[ arg], FILENAME_MAX); // by '1' or '2' later. + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-writers") == 0)) { + numWriters = strtouint32(argv[++arg]); + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-fasta") == 0)) { + outputFASTA = true; + } + else if ((mode == modePartition) && (strcmp(argv[arg], "-fastq") == 0)) { + outputFASTQ = true; + + if ((arg+1 < argc) && ('0' <= argv[arg+1][0]) && (argv[arg+1][0] <= '9')) + outputQV = strtouint32(argv[++arg]); + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-coverage") == 0)) { // Sample reads up to some coverage C + desiredCoverage = strtodouble(argv[++arg]); + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-genomesize") == 0)) { + genomeSize = strtouint64(argv[++arg]); + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-bases") == 0)) { // Sample B bases + desiredNumBases = strtouint64(argv[++arg]); + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-reads") == 0)) { // Sample N reads + desiredNumReads = strtouint64(argv[++arg]); + } + + else if ((mode == modePartition) && (strcmp(argv[arg], "-pairs") == 0)) { // Sample N pairs of reads + desiredNumReads = strtouint64(argv[++arg]) * 2; + } + + else { + return(false); + } + + return(true); +} + + + +void +partitionParameters::showUsage(opMode mode) { + + if (mode != modePartition) + return; + + fprintf(stderr, "OPTIONS for partition mode:\n"); + fprintf(stderr, " -paired treat inputs as paired sequences; the first two files form the\n"); + fprintf(stderr, " first pair, and so on.\n"); + fprintf(stderr, " NOT IMPLEMENTED\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -output O write output sequences to file O. If paired, two files must be supplied.\n"); + fprintf(stderr, " -fasta write output as FASTA\n"); + fprintf(stderr, " -fastq [q] write output as FASTQ; if no quality values, use q (integer, 0-based) for all\n"); + fprintf(stderr, " -writers N use N threads for writing (compressed) output. The last N files will\n"); + fprintf(stderr, " be smaller than requested.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -coverage C output C coverage of sequences, based on genome size G.\n"); + fprintf(stderr, " -genomesize G \n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -bases B output B bases.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -reads R output R reads.\n"); + fprintf(stderr, " -pairs P output P pairs (only if -paired).\n"); + fprintf(stderr, "\n"); +} + + + +bool +partitionParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modePartition) + return(false); + + if (inputs.size() == 0) + sprintf(errors, "ERROR: No input sequence files supplied.\n"); + + if ((output1[0] == 0) && + (output2[0] == 0)) + sprintf(errors, "ERROR: No output pattern (-output) supplied.\n"); + + if ((desiredCoverage == 0) && + (desiredNumReads == 0) && + (desiredNumBases == 0)) + sprintf(errors, "ERROR: No partitioning size (-coverage, -bases, -reads) supplied.\n"); + + return(errors.size() > 0); +} @@ -38,7 +145,7 @@ openOutput(char const *pattern, uint32 index) { fprintf(stderr, "ERROR: Failed to find '#' in output pattern '%s', and asked to make multiple outputs.\n", pattern), exit(1); if (pattern[ap] == 0) - return(new compressedFileWriter(pattern)); + return(new compressedFileWriter(pattern, 9)); // We've got #'s in the string. We want to replace the last block of 'em // with digits (ap found above is the start of the first block, sigh). @@ -83,7 +190,7 @@ openOutput(char const *pattern, uint32 index) { name[ii] = digs[7 - dp--]; } - return(new compressedFileWriter(name)); + return(new compressedFileWriter(name, 9)); } @@ -91,9 +198,7 @@ openOutput(char const *pattern, uint32 index) { void -doPartition_single(vector &inputs, partitionParameters &parPar) { - - parPar.initialize(); +doPartition_single(vector &inputs, partitionParameters &parPar) { // Figure out how many bases/sequences to write in each output file. // @@ -185,7 +290,7 @@ doPartition_single(vector &inputs, partitionParameters &parPar) { void -doPartition(vector &inputs, partitionParameters &parPar) { +doPartition(vector &inputs, partitionParameters &parPar) { if (parPar.isPaired == false) doPartition_single(inputs, parPar); diff --git a/src/seqrequester/partition.H b/src/seqrequester/partition.H index 21631c5..27d2615 100644 --- a/src/seqrequester/partition.H +++ b/src/seqrequester/partition.H @@ -24,59 +24,34 @@ class partitionParameters { public: - partitionParameters() { - isPaired = false; + partitionParameters() {} + ~partitionParameters() {} - desiredCoverage = 0.0; - genomeSize = 0; + bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv); + void showUsage(opMode mode); - desiredNumReads = 0; - desiredNumBases = 0; + bool checkOptions(opMode mode, std::vector &inputs, std::vector &errors); - desiredFraction = 0.0; + bool isPaired = false; - outputFASTA = false; - outputFASTQ = false; - outputQV = 20; + double desiredCoverage = 0.0; + uint64 genomeSize = 0; - numWriters = 4; + uint64 desiredNumReads = uint64max; + uint64 desiredNumBases = uint64max; - memset(output1, 0, FILENAME_MAX+1); - memset(output2, 0, FILENAME_MAX+1); - } + double desiredFraction = 0.0; - ~partitionParameters() { - } + bool randomSeedValid = false; + uint32 randomSeed = 0; + bool outputFASTA = false; + bool outputFASTQ = false; + uint8 outputQV = 20; - void initialize(void) { - }; + uint32 numWriters = 4; - - void finalize(void) { - } - - - bool isPaired; - - double desiredCoverage; - uint64 genomeSize; - - uint64 desiredNumReads; - uint64 desiredNumBases; - - double desiredFraction; - - bool randomSeedValid; - uint32 randomSeed; - - bool outputFASTA; - bool outputFASTQ; - uint8 outputQV; - - uint32 numWriters; - - char output1[FILENAME_MAX+1]; - char output2[FILENAME_MAX+1]; + char output1[FILENAME_MAX+1] = {0}; + char output2[FILENAME_MAX+1] = {0}; }; diff --git a/src/seqrequester/sample.C b/src/seqrequester/sample.C index 9291c32..41e7a3a 100644 --- a/src/seqrequester/sample.C +++ b/src/seqrequester/sample.C @@ -18,15 +18,126 @@ */ #include "seqrequester.H" -#include "sequence.H" -#include "mt19937ar.H" + +bool +sampleParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if (strcmp(argv[arg], "sample") == 0) { + mode = modeSample; + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-paired") == 0)) { + isPaired = true; + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-copies") == 0)) { + numCopies = strtouint32(argv[++arg]); + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-output") == 0)) { + //strncpy(output1, argv[++arg], FILENAME_MAX); // #'s in the name will be replaced + //strncpy(output2, argv[ arg], FILENAME_MAX); // by '1' or '2' later. + output1 = argv[++arg]; + output2 = argv[ arg]; + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-fasta") == 0)) { + outputFASTA = true; + } + else if ((mode == modeSample) && (strcmp(argv[arg], "-fastq") == 0)) { + outputFASTQ = true; + + if ((arg+1 < argc) && ('0' <= argv[arg+1][0]) && (argv[arg+1][0] <= '9')) + outputQV = strtouint32(argv[++arg]); + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-coverage") == 0)) { // Sample reads up to some coverage C + desiredCoverage = strtodouble(argv[++arg]); + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-genomesize") == 0)) { + genomeSize = strtouint64(argv[++arg]); + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-bases") == 0)) { // Sample B bases + desiredNumBases = strtouint64(argv[++arg]); + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-reads") == 0)) { // Sample N reads + desiredNumReads = strtouint64(argv[++arg]); + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-pairs") == 0)) { // Sample N pairs of reads + desiredNumReads = strtouint64(argv[++arg]) * 2; + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-fraction") == 0)) { // Sample F fraction + desiredFraction = strtodouble(argv[++arg]); + } + + else if ((mode == modeSample) && (strcmp(argv[arg], "-seed") == 0)) { // Seed for pseudo random number generator + mt.mtSetSeed(strtouint32(argv[++arg])); + } + + else { + return(false); + } + + return(true); +} + + + +void +sampleParameters::showUsage(opMode mode) { + + if (mode != modeSample) + return; + + fprintf(stderr, "OPTIONS for sample mode:\n"); + fprintf(stderr, " -paired treat inputs as paired sequences; the first two files form the\n"); + fprintf(stderr, " first pair, and so on.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -copies C write C different copies of the sampling (without replacement).\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -output O write output sequences to file O. If paired, two files must be supplied.\n"); + fprintf(stderr, " -fasta write output as FASTA\n"); + fprintf(stderr, " -fastq [q] write output as FASTQ; if no quality values, use q (integer, 0-based) for all\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -coverage C output C coverage of sequences, based on genome size G.\n"); + fprintf(stderr, " -genomesize G \n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -bases B output B bases.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -reads R output R reads.\n"); + fprintf(stderr, " -pairs P output P pairs (only if -paired).\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -fraction F output fraction F of the input bases.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -seed s seed the pseudo-random number generate with integer 's'\n"); + fprintf(stderr, "\n"); +} + + + +bool +sampleParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeSample) + return(false); + + if (inputs.size() == 0) + sprintf(errors, "ERROR: No input sequence files supplied.\n"); + + return(errors.size() > 0); +} class seqEntry { public: - seqEntry(mtRandom &MT, uint64 pos_, uint32 len_) { - rnd = MT.mtRandomRealOpen(); + seqEntry(mtRandom &mt, uint64 pos_, uint32 len_) { + rnd = mt.mtRandomRealOpen(); pos = pos_; len = len_; out = UINT32_MAX; @@ -165,9 +276,7 @@ doSample_sample(sampleParameters &samPar, void -doSample_paired(vector &inputs, sampleParameters &samPar) { - - samPar.initialize(); +doSample_paired(vector &inputs, sampleParameters &samPar) { vector numSeqsPerFile; vector seqOrder; @@ -179,8 +288,6 @@ doSample_paired(vector &inputs, sampleParameters &samPar) { vector sequences; vector qualities; - mtRandom MT; - // No support for multiple copies. if (samPar.numCopies != 1) @@ -195,16 +302,16 @@ doSample_paired(vector &inputs, sampleParameters &samPar) { char *a = strrchr(samPar.output1, '#'); char *b = strrchr(samPar.output2, '#'); - if (a == NULL) + if (a == nullptr) fprintf(stderr, "ERROR: Failed to find '#' in output name '%s'\n", samPar.output1), exit(1); - if (b == NULL) + if (b == nullptr) fprintf(stderr, "ERROR: Failed to find '#' in output name '%s'\n", samPar.output2), exit(1); *a = '1'; *b = '2'; - outFile1 = new compressedFileWriter(samPar.output1); - outFile2 = new compressedFileWriter(samPar.output2); + outFile1 = new compressedFileWriter(samPar.output1, 9); + outFile2 = new compressedFileWriter(samPar.output2, 9); } // Scan the inputs, saving the number of sequences in each and the length of each sequence. @@ -222,7 +329,7 @@ doSample_paired(vector &inputs, sampleParameters &samPar) { while ((sf1more == true) && (sf2more == true)) { - seqOrder.push_back(seqEntry(MT, numSeqsTotal, seq1.length() + seq2.length())); + seqOrder.push_back(seqEntry(samPar.mt, numSeqsTotal, seq1.length() + seq2.length())); numSeqsTotal += 1; numBasesTotal += seq1.length() + seq2.length(); @@ -293,7 +400,7 @@ doSample_single_openOutput(sampleParameters &samPar, uint32 ii) { fprintf(stderr, "ERROR: Failed to find '#' in output name '%s', and asked to make multiple copies.\n", samPar.output1), exit(1); if (samPar.output1[ap] == 0) - return(new compressedFileWriter(samPar.output1)); + return(new compressedFileWriter(samPar.output1, 9)); // We've got #'s in the string. We want to replace the last block of 'em // with digits (ap found above is the start of the first block, sigh). @@ -338,16 +445,13 @@ doSample_single_openOutput(sampleParameters &samPar, uint32 ii) { name[ii] = digs[7 - dp--]; } - return(new compressedFileWriter(name)); + return(new compressedFileWriter(name, 9)); } void -doSample_single(vector &inputs, sampleParameters &samPar) { - - samPar.initialize(); - +doSample_single(vector &inputs, sampleParameters &samPar) { vector numSeqsPerFile; vector seqOrder; @@ -358,11 +462,6 @@ doSample_single(vector &inputs, sampleParameters &samPar) { vector sequences; vector qualities; - mtRandom MT; - - if (samPar.randomSeedValid) - MT.mtSetSeed(samPar.randomSeed); - // Open output files. If paired, replace #'s in the output names with 1 or 2. fprintf(stderr, "Opening %u output file%s.\n", samPar.numCopies, (samPar.numCopies == 1) ? "" : "s"); @@ -383,7 +482,7 @@ doSample_single(vector &inputs, sampleParameters &samPar) { uint64 num = 0; while (sf1->loadSequence(seq1)) { - seqOrder.push_back(seqEntry(MT, numSeqsTotal, seq1.length())); + seqOrder.push_back(seqEntry(samPar.mt, numSeqsTotal, seq1.length())); numSeqsTotal += 1; numBasesTotal += seq1.length(); @@ -446,7 +545,7 @@ doSample_single(vector &inputs, sampleParameters &samPar) { void -doSample(vector &inputs, sampleParameters &samPar) { +doSample(vector &inputs, sampleParameters &samPar) { if (samPar.isPaired == false) doSample_single(inputs, samPar); diff --git a/src/seqrequester/sample.H b/src/seqrequester/sample.H index 02cd34d..ded22b1 100644 --- a/src/seqrequester/sample.H +++ b/src/seqrequester/sample.H @@ -25,60 +25,35 @@ class sampleParameters { public: sampleParameters() { - isPaired = false; - - numCopies = 1; - - desiredCoverage = 0.0; - genomeSize = 0; - - desiredNumReads = 0; - desiredNumBases = 0; - - desiredFraction = 0.0; - - randomSeedValid = false; - randomSeed = 0; - - outputFASTA = false; - outputFASTQ = false; - outputQV = 20; - - memset(output1, 0, FILENAME_MAX+1); - memset(output2, 0, FILENAME_MAX+1); - } - - ~sampleParameters() { + //output1 = new char [FILENAME_MAX+1]; + //output2 = new char [FILENAME_MAX+1]; } + ~sampleParameters() {} + bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv); + void showUsage(opMode mode); - void initialize(void) { - }; + bool checkOptions(opMode mode, vector &inputs, vector &errors); +public: + bool isPaired = false; - void finalize(void) { - } - - - bool isPaired; - - uint32 numCopies; + uint32 numCopies = 1; - double desiredCoverage; - uint64 genomeSize; + double desiredCoverage = 0.0; + uint64 genomeSize = 0; - uint64 desiredNumReads; - uint64 desiredNumBases; + uint64 desiredNumReads = 0; + uint64 desiredNumBases = 0; - double desiredFraction; + double desiredFraction = 0.0; - bool randomSeedValid; - uint32 randomSeed; + mtRandom mt; - bool outputFASTA; - bool outputFASTQ; - uint8 outputQV; + bool outputFASTA = false; + bool outputFASTQ = false; + uint8 outputQV = 20; - char output1[FILENAME_MAX+1]; - char output2[FILENAME_MAX+1]; + char const *output1 = nullptr;//[FILENAME_MAX+1] = {0}; + char const *output2 = nullptr;//[FILENAME_MAX+1] = {0}; }; diff --git a/src/seqrequester/seqrequester.C b/src/seqrequester/seqrequester.C index 29b8a78..64f0520 100644 --- a/src/seqrequester/seqrequester.C +++ b/src/seqrequester/seqrequester.C @@ -18,580 +18,77 @@ */ #include "seqrequester.H" -#include "files.H" -#include "strings.H" +bool +isInput(int32 arg, int32 argc, char **argv, std::vector &inputs) { -enum opMode { - modeSummarize, // Summarize sequences in FASTA or FASTQ inputs - modeExtract, // Extract sequences or subsequences from FASTA or FASTQ inputs - modeGenerate, // Generate random sequences - modeSimulate, // Simulate reads from FASTA or FASTQ inputs (ontigs/scaffolds/chromosomes) - modePartition, // Rewrite inputs into fixed size outputs - modeSample, // Extract random sequences from FASTA or FASTQ inputs - modeShift, // Generate sequence based on a shift register - modeMicroSatellite, // Find microsatellite repeats - modeMutate, // Randomly mutate bases - modeUnset // Cause an error -}; - + if ((strcmp(argv[arg], "-") == 0) || + (fileExists(argv[arg]) == true)) { + inputs.push_back(argv[arg]); + return(true); + } + return(false); +} int main(int argc, char **argv) { - vector inputs; - opMode mode = modeUnset; - summarizeParameters sumPar; extractParameters extPar; generateParameters genPar; + microsatelliteParameters msPar; + mutateParameters mutPar; partitionParameters parPar; sampleParameters samPar; - simulateParameters simPar; shiftRegisterParameters srPar; - microsatelliteParameters msPar; - mutateParameters mutPar; - - argc = AS_configure(argc, argv); - - vector err; - int arg = 1; - while (arg < argc) { - - // SUMMARIZE - - if (strcmp(argv[arg], "summarize") == 0) { - mode = modeSummarize; - } - - else if ((mode == modeSummarize) && (strcmp(argv[arg], "-size") == 0)) { - sumPar.genomeSize = strtoull(argv[++arg], NULL, 10); - } - - else if ((mode == modeSummarize) && (strcmp(argv[arg], "-1x") == 0)) { - sumPar.limitTo1x = true; - } - - else if ((mode == modeSummarize) && (strcmp(argv[arg], "-split-n") == 0)) { - sumPar.breakAtN = true; - } - - else if ((mode == modeSummarize) && (strcmp(argv[arg], "-simple") == 0)) { - sumPar.asSimple = true; - } - - else if ((mode == modeSummarize) && (strcmp(argv[arg], "-lengths") == 0)) { - sumPar.asLength = true; - } - - else if ((mode == modeSummarize) && (strcmp(argv[arg], "-assequences") == 0)) { - sumPar.asSequences = true; - sumPar.asBases = false; - } - - else if ((mode == modeSummarize) && (strcmp(argv[arg], "-asbases") == 0)) { - sumPar.asSequences = false; - sumPar.asBases = true; - } - - // EXTRACT - - else if (strcmp(argv[arg], "extract") == 0) { - mode = modeExtract; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-bases") == 0)) { - decodeRange(argv[++arg], extPar.baseBgn, extPar.baseEnd); - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-sequences") == 0)) { - extPar.seqsArgs.push_back(argv[++arg]); - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-reverse") == 0)) { - extPar.asReverse = true; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-complement") == 0)) { - extPar.asComplement = true; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-rc") == 0)) { - extPar.asReverse = true; - extPar.asComplement = true; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-upper") == 0)) { - extPar.asUpperCase = true; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-lower") == 0)) { - extPar.asLowerCase = true; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-compress") == 0)) { - extPar.asCompressed = true; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-length") == 0)) { - extPar.lensArgs.push_back(argv[++arg]); - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-lowermask") == 0)) { - extPar.doMasking = true; - extPar.maskWithN = false; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-nmask") == 0)) { - extPar.doMasking = true; - extPar.maskWithN = true; - } - - else if ((mode == modeExtract) && (strcmp(argv[arg], "-fasta") == 0)) { - extPar.outputFASTA = true; - } - else if ((mode == modeExtract) && (strcmp(argv[arg], "-fastq") == 0)) { - extPar.outputFASTQ = true; - - if ((arg+1 < argc) && ('0' <= argv[arg+1][0]) && (argv[arg+1][0] <= '9')) - extPar.outputQV = strtouint32(argv[++arg]); - } - - // GENERATE - - else if (strcmp(argv[arg], "generate") == 0) { - mode = modeGenerate; - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-seed") == 0)) { - genPar.randomSeedValid = true; - genPar.randomSeed = strtouint32(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-ident") == 0)) { - genPar.ident = argv[++arg]; - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-min") == 0)) { - genPar.minLength = strtouint64(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-max") == 0)) { - genPar.maxLength = strtouint64(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-sequences") == 0)) { - genPar.nSeqs = strtouint64(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-bases") == 0)) { - genPar.nBases = strtouint64(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-guassian") == 0)) { - genPar.useGaussian = true; - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-mirror") == 0)) { - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-gc") == 0)) { - double gc = strtodouble(argv[++arg]); - double at = 1.0 - gc; - - genPar.gFreq = genPar.cFreq = gc / 2.0; - genPar.aFreq = genPar.tFreq = at / 2.0; - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-at") == 0)) { - double at = strtodouble(argv[++arg]); - double gc = 1.0 - at; - - genPar.gFreq = genPar.cFreq = gc / 2.0; - genPar.aFreq = genPar.tFreq = at / 2.0; - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-a") == 0) ){ - genPar.aFreq = strtodouble(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-c") == 0)) { - genPar.cFreq = strtodouble(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-g") == 0)) { - genPar.gFreq = strtodouble(argv[++arg]); - } - - else if ((mode == modeGenerate) && (strcmp(argv[arg], "-t") == 0)) { - genPar.tFreq = strtodouble(argv[++arg]); - } - - // SIMULATE - - else if (strcmp(argv[arg], "simulate") == 0) { - mode = modeSimulate; - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-genomesize") == 0)) { - simPar.genomeSize = strtouint64(argv[++arg]); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-coverage") == 0)) { - simPar.desiredCoverage = strtodouble(argv[++arg]); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-nreads") == 0)) { - simPar.desiredNumReads = strtouint64(argv[++arg]); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-nbases") == 0)) { - simPar.desiredNumBases = strtouint64(argv[++arg]); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-circular") == 0)) { - simPar.circular = true; - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-truncate") == 0)) { - simPar.truncate = true; - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-genome") == 0)) { - strncpy(simPar.genomeName, argv[++arg], FILENAME_MAX); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-distribution") == 0)) { - strncpy(simPar.distribName, argv[++arg], FILENAME_MAX); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-length") == 0)) { - decodeRange(argv[++arg], simPar.desiredMinLength, simPar.desiredMaxLength); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-reverse") == 0)) { - simPar.rcProb = strtodouble(argv[++arg]); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-seed") == 0)) { - simPar.randomSeedValid = true; - simPar.randomSeed = strtouint32(argv[++arg]); - } - - - //else if ((mode == modeSimulate) && (strcmp(argv[arg], "-name") == 0)) { - // strncpy(simPar.sequenceName, argv[++arg], FILENAME_MAX); - //} - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-output") == 0)) { - strncpy(simPar.outputName, argv[++arg], FILENAME_MAX); - } - - else if ((mode == modeSimulate) && (strcmp(argv[arg], "-test") == 0)) { - simPar.test = true; - } - - // PARTITION - - else if (strcmp(argv[arg], "partition") == 0) { - mode = modePartition; - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-paired") == 0)) { - parPar.isPaired = true; - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-output") == 0)) { - strncpy(parPar.output1, argv[++arg], FILENAME_MAX); // #'s in the name will be replaced - strncpy(parPar.output2, argv[ arg], FILENAME_MAX); // by '1' or '2' later. - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-writers") == 0)) { - parPar.numWriters = strtouint32(argv[++arg]); - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-fasta") == 0)) { - parPar.outputFASTA = true; - } - else if ((mode == modePartition) && (strcmp(argv[arg], "-fastq") == 0)) { - parPar.outputFASTQ = true; - - if ((arg+1 < argc) && ('0' <= argv[arg+1][0]) && (argv[arg+1][0] <= '9')) - parPar.outputQV = strtouint32(argv[++arg]); - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-coverage") == 0)) { // Sample reads up to some coverage C - parPar.desiredCoverage = strtodouble(argv[++arg]); - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-genomesize") == 0)) { - parPar.genomeSize = strtouint64(argv[++arg]); - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-bases") == 0)) { // Sample B bases - parPar.desiredNumBases = strtouint64(argv[++arg]); - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-reads") == 0)) { // Sample N reads - parPar.desiredNumReads = strtouint64(argv[++arg]); - } - - else if ((mode == modePartition) && (strcmp(argv[arg], "-pairs") == 0)) { // Sample N pairs of reads - parPar.desiredNumReads = strtouint64(argv[++arg]) * 2; - } - - // SAMPLE - - else if (strcmp(argv[arg], "sample") == 0) { - mode = modeSample; - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-paired") == 0)) { - samPar.isPaired = true; - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-copies") == 0)) { - samPar.numCopies = strtouint32(argv[++arg]); - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-output") == 0)) { - strncpy(samPar.output1, argv[++arg], FILENAME_MAX); // #'s in the name will be replaced - strncpy(samPar.output2, argv[ arg], FILENAME_MAX); // by '1' or '2' later. - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-fasta") == 0)) { - samPar.outputFASTA = true; - } - else if ((mode == modeSample) && (strcmp(argv[arg], "-fastq") == 0)) { - samPar.outputFASTQ = true; - - if ((arg+1 < argc) && ('0' <= argv[arg+1][0]) && (argv[arg+1][0] <= '9')) - samPar.outputQV = strtouint32(argv[++arg]); - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-coverage") == 0)) { // Sample reads up to some coverage C - samPar.desiredCoverage = strtodouble(argv[++arg]); - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-genomesize") == 0)) { - samPar.genomeSize = strtouint64(argv[++arg]); - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-bases") == 0)) { // Sample B bases - samPar.desiredNumBases = strtouint64(argv[++arg]); - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-reads") == 0)) { // Sample N reads - samPar.desiredNumReads = strtouint64(argv[++arg]); - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-pairs") == 0)) { // Sample N pairs of reads - samPar.desiredNumReads = strtouint64(argv[++arg]) * 2; - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-fraction") == 0)) { // Sample F fraction - samPar.desiredFraction = strtodouble(argv[++arg]); - } - - else if ((mode == modeSample) && (strcmp(argv[arg], "-seed") == 0)) { // Seed for pseudo random number generator - samPar.randomSeedValid = true; - samPar.randomSeed = strtouint32(argv[++arg]); - } - - // SHIFT - - else if (strcmp(argv[arg], "shift") == 0) { - mode = modeShift; - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-order") == 0)) { - srPar.order = strtouint32(argv[++arg]); - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-weight") == 0)) { - srPar.weight = strtouint32(argv[++arg]); - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-emit") == 0)) { - srPar.search = false; - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-length") == 0)) { - srPar.length = strtouint64(argv[++arg]); - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-search") == 0)) { - srPar.search = true; - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-report") == 0)) { - srPar.report = strtodouble(argv[++arg]); - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-fast") == 0)) { - srPar.fast = true; - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-state") == 0)) { // Initial sequence - strcpy(srPar.sr, argv[++arg]); // ACGTGGTAA - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-tap") == 0)) { // SR control bits - strcpy(srPar.svmin, argv[++arg]); // 011010011 - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-tapmin") == 0)) { // SR control bits - strcpy(srPar.svmin, argv[++arg]); // 011010011 - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "-tapmax") == 0)) { // SR control bits - strcpy(srPar.svmax, argv[++arg]); // 011010011 - } - - else if ((mode == modeShift) && (strcmp(argv[arg], "") == 0)) { - } - - // MICROSATELLITE - - else if ((strcmp(argv[arg], "microsatellite") == 0) || - (strcmp(argv[arg], "microsat") == 0)) { - mode = modeMicroSatellite; - } - - else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-prefix") == 0)) { - msPar.outPrefix = argv[++arg]; - } - - else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-window") == 0)) { - msPar.window = strtouint32(argv[++arg]); - } - - else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-ga") == 0)) { - msPar.report_ga = true; - } - else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-gc") == 0)) { - msPar.report_gc = true; - } - else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-at") == 0)) { - msPar.report_at = true; - } - - else if ((mode == modeMicroSatellite) && (strcmp(argv[arg], "-legacy") == 0)) { - msPar.report_legacy = true; - } - - // MUTATE - - else if (strcmp(argv[arg], "mutate") == 0) { - mode = modeMutate; - } - - else if ((mode == modeMutate) && (strcmp(argv[arg], "-s") == 0)) { - double p = strtodouble(argv[arg+1]); - char a = argv[arg+2][0]; - char b = argv[arg+3][0]; - - arg += 3; - - mutPar.setProbabilitySubstititue(p, a, b); - } - - else if ((mode == modeMutate) && (strcmp(argv[arg], "-i") == 0)) { - double p = strtodouble(argv[arg+1]); - char a = argv[arg+2][0]; - - arg += 2; - - mutPar.setProbabilityInsert(p, a); - } - - else if ((mode == modeMutate) && (strcmp(argv[arg], "-d") == 0)) { - double p = strtodouble(argv[arg+1]); - char a = argv[arg+2][0]; - - arg += 2; - - mutPar.setProbabilityDelete(p, a); - } - - // INPUTS - - else if (strcmp(argv[arg], "-") == 0) { - inputs.push_back(argv[arg]); - } - - else if (fileExists(argv[arg]) == true) { - inputs.push_back(argv[arg]); - } - - else { - char *s = new char [1024]; - snprintf(s, 1024, "ERROR: Unknown parameter '%s'\n", argv[arg]); - err.push_back(s); - } - arg++; - } - - // Check for required options. - - if (mode == modeUnset) { - err.push_back("ERROR: No mode (summarize, extract, generate or simulate) specified.\n"); - } - - if (mode == modeSummarize) { - if (inputs.size() == 0) - err.push_back("ERROR: No input sequence files supplied.\n"); - } - - if (mode == modeExtract) { - if (inputs.size() == 0) - err.push_back("ERROR: No input sequence files supplied.\n"); - } - - if (mode == modeGenerate) { - } - - if (mode == modeSimulate) { - if (simPar.genomeName[0] == 0) - err.push_back("ERROR: No reference genome sequence (-genome) supplied.\n"); - } - - if (mode == modePartition) { - if (inputs.size() == 0) - err.push_back("ERROR: No input sequence files supplied.\n"); - if ((parPar.output1[0] == 0) && - (parPar.output2[0] == 0)) - err.push_back("ERROR: No output pattern (-output) supplied.\n"); - if ((parPar.desiredCoverage == 0) && - (parPar.desiredNumReads == 0) && - (parPar.desiredNumBases == 0)) - err.push_back("ERROR: No partitioning size (-coverage, -bases, -reads) supplied.\n"); - } - - if (mode == modeSample) { - if (inputs.size() == 0) - err.push_back("ERROR: No input sequence files supplied.\n"); - } + simulateParameters simPar; + summarizeParameters sumPar; - if (mode == modeShift) { - } + std::vector inputs; + std::vector errors; - if (mode == modeMutate) { - } + argc = AS_configure(argc, argv); - if (mode == modeMicroSatellite) { - if (inputs.size() == 0) - err.push_back("ERROR: No input sequence files supplied.\n"); - if ((msPar.report_ga == false) && - (msPar.report_gc == false) && - (msPar.report_at == false)) - err.push_back("ERROR: No microsatellite pattern supplied.\n"); + // parseOption() returns true if the option (and potentially some words + // following it) are consumed. + + for (int arg=1; arg < argc; arg++) + if ((extPar.parseOption(mode, arg, argc, argv) == false) && + (genPar.parseOption(mode, arg, argc, argv) == false) && + ( msPar.parseOption(mode, arg, argc, argv) == false) && + (mutPar.parseOption(mode, arg, argc, argv) == false) && + (parPar.parseOption(mode, arg, argc, argv) == false) && + (samPar.parseOption(mode, arg, argc, argv) == false) && + ( srPar.parseOption(mode, arg, argc, argv) == false) && + (simPar.parseOption(mode, arg, argc, argv) == false) && + (sumPar.parseOption(mode, arg, argc, argv) == false) && + (isInput(arg, argc, argv, inputs) == false)) + sprintf(errors, "ERROR: Unknown parameter '%s'\n", argv[arg]); + + if (mode == modeUnset) + sprintf(errors, "ERROR: No mode (summarize, extract, generate, et cetera) specified.\n"); + + // Check for required options. checkOptions() returns true if any of the + // supplied options generate errors; false if there are no errors or + // the mode is not active. + + if ((extPar.checkOptions(mode, inputs, errors) == true) || + (genPar.checkOptions(mode, inputs, errors) == true) || + ( msPar.checkOptions(mode, inputs, errors) == true) || + (mutPar.checkOptions(mode, inputs, errors) == true) || + (parPar.checkOptions(mode, inputs, errors) == true) || + (samPar.checkOptions(mode, inputs, errors) == true) || + ( srPar.checkOptions(mode, inputs, errors) == true) || + (simPar.checkOptions(mode, inputs, errors) == true) || + (sumPar.checkOptions(mode, inputs, errors) == true) || + (mode == modeUnset)) { + // And do what? } // If errors, report usage. - if (err.size() > 0) { + if (errors.size() > 0) { fprintf(stderr, "usage: %s [mode] [options] [sequence_file ...]\n", argv[0]); fprintf(stderr, "\n"); @@ -606,223 +103,35 @@ main(int argc, char **argv) { fprintf(stderr, "\n"); } - if ((mode == modeUnset) || (mode == modeSummarize)) { - fprintf(stderr, "OPTIONS for summarize mode:\n"); - fprintf(stderr, " -size base size to use for N50 statistics\n"); - fprintf(stderr, " -1x limit NG table to 1x coverage\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -split-n split sequences at N bases before computing length\n"); - fprintf(stderr, " -simple output a simple 'length numSequences' histogram\n"); - fprintf(stderr, " -lengths output a list of the sequence lengths\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -assequences load data as complete sequences (for testing)\n"); - fprintf(stderr, " -asbases load data as blocks of bases (for testing)\n"); - fprintf(stderr, "\n"); - } - - if ((mode == modeUnset) || (mode == modeExtract)) { - fprintf(stderr, "OPTIONS for extract mode:\n"); - fprintf(stderr, " -bases baselist extract bases specified in the 'baselist' from\n"); - fprintf(stderr, " each sequence\n"); - fprintf(stderr, " -sequences seqlist extract ordinal sequences specified in the 'seqlist'\n"); - fprintf(stderr, " -sequences namefile extract sequences with names listed in 'namefile'\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -reverse reverse the bases in the sequence\n"); - fprintf(stderr, " -complement complement the bases in the sequence\n"); - fprintf(stderr, " -rc alias for -reverse -complement\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -compress compress homopolymer runs to one base\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -upcase\n"); - fprintf(stderr, " -downcase\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -fasta write output as FASTA\n"); - fprintf(stderr, " -fastq [q] write output as FASTQ; if no quality values, use q\n"); - fprintf(stderr, " (integer, 0-based) for all bases; default q=20\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -length min-max print sequence if it is at least 'min' bases and at\n"); - fprintf(stderr, " most 'max' bases long\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " a 'baselist' is a set of integers formed from any combination\n"); - fprintf(stderr, " of the following, seperated by a comma:\n"); - fprintf(stderr, " num a single number\n"); - fprintf(stderr, " bgn-end a range of numbers: bgn <= end\n"); - fprintf(stderr, " bases are spaced-based; -bases 0-2,4 will print the bases between\n"); - fprintf(stderr, " the first two spaces (the first two bases) and the base after the\n"); - fprintf(stderr, " fourth space (the fifth base).\n"); - fprintf(stderr, " \n"); - fprintf(stderr, " a 'seqlist' is a set of integers formed from any combination\n"); - fprintf(stderr, " of the following, seperated by a comma:\n"); - fprintf(stderr, " num a single number\n"); - fprintf(stderr, " bgn-end a range of numbers: bgn <= end\n"); - fprintf(stderr, " sequences are 1-based; -sequences 1,3-5 will print the first, third,\n"); - fprintf(stderr, " fourth and fifth sequences.\n"); - fprintf(stderr, " \n"); - fprintf(stderr, " a 'namefile' contains the names of sequences to extract, one per line,\n"); - fprintf(stderr, " without any leading '>' or '@'. a name is the first word on the ident line.\n"); - fprintf(stderr, " NOTA BENE: at most one namefile cn be supplied.\n"); - fprintf(stderr, " \n"); - } - - if ((mode == modeUnset) || (mode == modeSimulate)) { - fprintf(stderr, "OPTIONS for simulate mode:\n"); - fprintf(stderr, " -genome G sample reads from these sequences\n"); - fprintf(stderr, " -circular treat the sequences in G as circular\n"); - fprintf(stderr, " -truncate sample uniformly, at expense of read length\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -genomesize g genome size to use for deciding coverage below\n"); - fprintf(stderr, " -coverage c generate approximately c coverage of output\n"); - fprintf(stderr, " -nreads n generate exactly n reads of output\n"); - fprintf(stderr, " -nbases n generate approximately n bases of output\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -distribution F generate read length by sampling the distribution in file F\n"); - fprintf(stderr, " one column - each line is the length of a sequence\n"); - fprintf(stderr, " two columns - each line has the 'length' and 'number of sequences'\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " if file F doesn't exist, use a built-in distribution\n"); - fprintf(stderr, " ultra-long-nanopore\n"); - fprintf(stderr, " pacbio\n"); - fprintf(stderr, " pacbio-hifi\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -length min[-max] generate read length uniformly from range min-max\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -reverse p output a reverse-complement read with probability p\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -output x.fasta (not implemented)\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -test generate a read at every possible start position\n"); - fprintf(stderr, "\n"); - } - - if ((mode == modeUnset) || (mode == modePartition)) { - fprintf(stderr, "OPTIONS for partition mode:\n"); - fprintf(stderr, " -paired treat inputs as paired sequences; the first two files form the\n"); - fprintf(stderr, " first pair, and so on.\n"); - fprintf(stderr, " NOT IMPLEMENTED\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -output O write output sequences to file O. If paired, two files must be supplied.\n"); - fprintf(stderr, " -fasta write output as FASTA\n"); - fprintf(stderr, " -fastq [q] write output as FASTQ; if no quality values, use q (integer, 0-based) for all\n"); - fprintf(stderr, " -writers N use N threads for writing (compressed) output. The last N files will\n"); - fprintf(stderr, " be smaller than requested.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -coverage C output C coverage of sequences, based on genome size G.\n"); - fprintf(stderr, " -genomesize G \n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -bases B output B bases.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -reads R output R reads.\n"); - fprintf(stderr, " -pairs P output P pairs (only if -paired).\n"); - fprintf(stderr, "\n"); - } + extPar.showUsage(mode); + genPar.showUsage(mode); + msPar.showUsage(mode); + mutPar.showUsage(mode); + parPar.showUsage(mode); + samPar.showUsage(mode); + srPar.showUsage(mode); + simPar.showUsage(mode); + sumPar.showUsage(mode); - if ((mode == modeUnset) || (mode == modeSample)) { - fprintf(stderr, "OPTIONS for sample mode:\n"); - fprintf(stderr, " -paired treat inputs as paired sequences; the first two files form the\n"); - fprintf(stderr, " first pair, and so on.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -copies C write C different copies of the sampling (without replacement).\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -output O write output sequences to file O. If paired, two files must be supplied.\n"); - fprintf(stderr, " -fasta write output as FASTA\n"); - fprintf(stderr, " -fastq [q] write output as FASTQ; if no quality values, use q (integer, 0-based) for all\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -coverage C output C coverage of sequences, based on genome size G.\n"); - fprintf(stderr, " -genomesize G \n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -bases B output B bases.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -reads R output R reads.\n"); - fprintf(stderr, " -pairs P output P pairs (only if -paired).\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -fraction F output fraction F of the input bases.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -seed s seed the pseudo-random number generate with integer 's'\n"); - fprintf(stderr, "\n"); - } - - if ((mode == modeUnset) || (mode == modeGenerate)) { - fprintf(stderr, "OPTIONS for generate mode:\n"); - fprintf(stderr, " -ident S name reads using printf() pattern in S; default 'random%%08lu'\n"); - fprintf(stderr, " -min M minimum sequence length\n"); - fprintf(stderr, " -max M maximum sequence length\n"); - fprintf(stderr, " -sequences N generate N sequences\n"); - fprintf(stderr, " -bases B generate at least B bases, no more than B+maxLength-1 bases.\n"); - fprintf(stderr, " -gaussian 99.73%% of the reads (3 standard deviations) will be between min and max\n"); - fprintf(stderr, " -mirror F (not implemented; mirror the length distrib of F)\n"); - fprintf(stderr, " -gc bias sets GC/AT composition (default 0.50)\n"); - fprintf(stderr, " -at bias sets GC/AT composition (default 0.50)\n"); - fprintf(stderr, " -a freq sets frequency of A bases (default 0.25)\n"); - fprintf(stderr, " -c freq sets frequency of C bases (default 0.25)\n"); - fprintf(stderr, " -g freq sets frequency of G bases (default 0.25)\n"); - fprintf(stderr, " -t freq sets frequency of T bases (default 0.25)\n"); - fprintf(stderr, " -seed s seed the pseudo-random number generate with integer 's'\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "The -gc option is a shortcut for setting all four base frequencies at once. Order matters!\n"); - fprintf(stderr, " -gc 0.6 -a 0.1 -t 0.3 -- sets G = C = 0.3, A = 0.1, T = 0.3\n"); - fprintf(stderr, " -a 0.1 -t 0.3 -gc 0.6 -- sets G = C = 0.3, A = T = 0.15\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Base frequencies are scaled to sum to 1.0.\n"); - fprintf(stderr, " -a 1.25 -- results in a sum of 2.0 (1.25 + 0.25 + 0.25 + 0.25) so final frequencies will be:\n"); - fprintf(stderr, " A = 1.25/2 = 0.625\n"); - fprintf(stderr, " C = G = T = 0.25/2 = 0.125.\n"); - fprintf(stderr, " -gc 0.8 -a 1.0 -t 0.2 -- sum is also 2.0, final frequencies will be:\n"); - fprintf(stderr, " A = 1.00/2 = 0.5\n"); - fprintf(stderr, " C = G = 0.40/2 = 0.2\n"); - fprintf(stderr, " T = 0.20/2 = 0.1\n"); - fprintf(stderr, "\n"); - } - - if ((mode == modeUnset) || (mode == modeMicroSatellite)) { - fprintf(stderr, "OPTIONS for microsatellite mode:\n"); - fprintf(stderr, " -prefix P write output to

..bed\n"); - fprintf(stderr, " -window w compute in windows of size w; default write output to .bed\n"); - - fprintf(stderr, " -ga compute GA/TC repeat content\n"); - fprintf(stderr, " -gc compute GC repeat content\n"); - fprintf(stderr, " -at compute AT repeat content\n"); - } - - for (uint32 ii=0; ii #include #include using namespace std; +enum opMode { + modeSummarize, // Summarize sequences in FASTA or FASTQ inputs + modeExtract, // Extract sequences or subsequences from FASTA or FASTQ inputs + modeGenerate, // Generate random sequences + modeSimulate, // Simulate reads from FASTA or FASTQ inputs (ontigs/scaffolds/chromosomes) + modePartition, // Rewrite inputs into fixed size outputs + modeSample, // Extract random sequences from FASTA or FASTQ inputs + modeShift, // Generate sequence based on a shift register + modeMicroSatellite, // Find microsatellite repeats + modeMutate, // Randomly mutate bases + modeUnset // Cause an error +}; #define SEQREQUESTER_INCLUDE_H @@ -43,13 +59,12 @@ using namespace std; #undef SEQREQUESTER_INCLUDE_H - -void doSummarize (vector &inputs, summarizeParameters &sumPar); -void doExtract (vector &inputs, extractParameters &extPar); -void doGenerate ( generateParameters &genPar); -void doSimulate (vector &inputs, simulateParameters &simPar); -void doPartition (vector &inputs, partitionParameters &parPar); -void doSample (vector &inputs, sampleParameters &samPar); -void doShiftRegister ( shiftRegisterParameters &srPar); -void doMicroSatellite(vector &inputs, microsatelliteParameters &msPar); -void doMutate (vector &inputs, mutateParameters &mutPar); +void doSummarize (vector &inputs, summarizeParameters &sumPar); +void doExtract (vector &inputs, extractParameters &extPar); +void doGenerate ( generateParameters &genPar); +void doSimulate (vector &inputs, simulateParameters &simPar); +void doPartition (vector &inputs, partitionParameters &parPar); +void doSample (vector &inputs, sampleParameters &samPar); +void doShiftRegister ( shiftRegisterParameters &srPar); +void doMicroSatellite(vector &inputs, microsatelliteParameters &msPar); +void doMutate (vector &inputs, mutateParameters &mutPar); diff --git a/src/seqrequester/shiftregister-emit-fast.C b/src/seqrequester/shiftregister-emit-fast.C index f866caa..4300413 100644 --- a/src/seqrequester/shiftregister-emit-fast.C +++ b/src/seqrequester/shiftregister-emit-fast.C @@ -18,10 +18,10 @@ */ #include "seqrequester.H" + #include "shiftregister-gf4.H" #include "bits.H" - void emitShiftRegisterFast(shiftRegisterParameters &srPar) { // Allocate space for the loop detector, set local variables. diff --git a/src/seqrequester/shiftregister-search-fast.C b/src/seqrequester/shiftregister-search-fast.C index 8ba4cef..886258f 100644 --- a/src/seqrequester/shiftregister-search-fast.C +++ b/src/seqrequester/shiftregister-search-fast.C @@ -18,10 +18,10 @@ */ #include "seqrequester.H" + #include "shiftregister-gf4.H" #include "bits.H" - void searchShiftRegisterFast(shiftRegisterParameters &srPar) { // Allocate space for the loop detector, set local variables. diff --git a/src/seqrequester/shiftregister-search-slow.C b/src/seqrequester/shiftregister-search-slow.C index 6310099..e2d226c 100644 --- a/src/seqrequester/shiftregister-search-slow.C +++ b/src/seqrequester/shiftregister-search-slow.C @@ -18,10 +18,10 @@ */ #include "seqrequester.H" + #include "shiftregister-gf4.H" #include "bits.H" - char srprint[65]; char svprint[65]; diff --git a/src/seqrequester/shiftregister.C b/src/seqrequester/shiftregister.C index 96b715f..a066d58 100644 --- a/src/seqrequester/shiftregister.C +++ b/src/seqrequester/shiftregister.C @@ -18,12 +18,89 @@ */ #include "seqrequester.H" + #include "bits.H" +bool +shiftRegisterParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if (strcmp(argv[arg], "shift") == 0) { + mode = modeShift; + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-order") == 0)) { + order = strtouint32(argv[++arg]); + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-weight") == 0)) { + weight = strtouint32(argv[++arg]); + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-emit") == 0)) { + search = false; + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-length") == 0)) { + length = strtouint64(argv[++arg]); + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-search") == 0)) { + search = true; + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-report") == 0)) { + report = strtodouble(argv[++arg]); + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-fast") == 0)) { + fast = true; + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-state") == 0)) { // Initial sequence + strcpy(sr, argv[++arg]); // ACGTGGTAA + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-tap") == 0)) { // SR control bits + strcpy(svmin, argv[++arg]); // 011010011 + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-tapmin") == 0)) { // SR control bits + strcpy(svmin, argv[++arg]); // 011010011 + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "-tapmax") == 0)) { // SR control bits + strcpy(svmax, argv[++arg]); // 011010011 + } + + else if ((mode == modeShift) && (strcmp(argv[arg], "") == 0)) { + } + + else { + return(false); + } + + return(true); +} + void -shiftRegisterParameters::initialize(void) { +shiftRegisterParameters::showUsage(opMode mode) { + + if (mode != modeShift) + return; + + fprintf(stderr, "No help for mode shift.\n"); +} + + + +bool +shiftRegisterParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeShift) + return(false); + uint32 srLen = strlen(sr); uint32 snLen = strlen(svmin); uint32 sxLen = strlen(svmax); @@ -40,17 +117,18 @@ shiftRegisterParameters::initialize(void) { fail |= ((sxLen > 0) && (snLen != order)); if (fail == true) { - fprintf(stderr, "ERROR: order %u\n", order); - fprintf(stderr, "ERROR: sr %s len %u\n", sr, srLen); - fprintf(stderr, "ERROR: svmin %s len %u\n", svmin, snLen); - fprintf(stderr, "ERROR: svmax %s len %u\n", svmax, sxLen); + sprintf(errors, "ERROR: order %u\n", order); + sprintf(errors, "ERROR: sr %s len %u\n", sr, srLen); + sprintf(errors, "ERROR: svmin %s len %u\n", svmin, snLen); + sprintf(errors, "ERROR: svmax %s len %u\n", svmax, sxLen); } - assert(fail == false); if (srLen > 0) std::reverse(sr, sr + srLen); if (snLen > 0) std::reverse(svmin, svmin + snLen); if (sxLen > 0) std::reverse(svmax, svmax + sxLen); + + return(errors.size() > 0); } @@ -152,8 +230,6 @@ void emitShiftRegisterSlow(shiftRegisterParameters &srPar) { void doShiftRegister(shiftRegisterParameters &srPar) { - srPar.initialize(); - fprintf(stderr, "VERSION 7\n"); if ((srPar.search == true) && (srPar.fast == true)) diff --git a/src/seqrequester/shiftregister.H b/src/seqrequester/shiftregister.H index 4b630c3..8ecba96 100644 --- a/src/seqrequester/shiftregister.H +++ b/src/seqrequester/shiftregister.H @@ -42,16 +42,19 @@ public: ~shiftRegisterParameters() { }; - void initialize(void); + bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv); + void showUsage(opMode mode); - uint64 getEncodedSR(void); - uint64 getCycleLen(void); - uint64 getCycleMax(void); - uint64 getEncodedSVmin(void); - uint64 getEncodedSVmax(void); - uint64 getEncodedSVmask(void); + bool checkOptions(opMode mode, vector &inputs, vector &errors); - char numberToBase(uint32 number) { + uint64 getEncodedSR(void); + uint64 getCycleLen(void); + uint64 getCycleMax(void); + uint64 getEncodedSVmin(void); + uint64 getEncodedSVmax(void); + uint64 getEncodedSVmask(void); + + char numberToBase(uint32 number) { number = (number << 1) + 'A'; if (number == 'E') diff --git a/src/seqrequester/simulate.C b/src/seqrequester/simulate.C index f6291f4..f451bef 100644 --- a/src/seqrequester/simulate.C +++ b/src/seqrequester/simulate.C @@ -19,12 +19,133 @@ #include "seqrequester.H" -#include "arrays.H" -#include "sequence.H" +bool +simulateParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { -#include + if (strcmp(argv[arg], "simulate") == 0) { + mode = modeSimulate; + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-genomesize") == 0)) { + genomeSize = strtouint64(argv[++arg]); + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-coverage") == 0)) { + desiredCoverage = strtodouble(argv[++arg]); + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-nreads") == 0)) { + desiredNumReads = strtouint64(argv[++arg]); + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-nbases") == 0)) { + desiredNumBases = strtouint64(argv[++arg]); + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-circular") == 0)) { + circular = true; + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-truncate") == 0)) { + truncate = true; + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-genome") == 0)) { + genomeName = argv[++arg]; + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-distribution") == 0)) { + distribName = argv[++arg]; + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-length") == 0)) { + decodeRange(argv[++arg], desiredMinLength, desiredMaxLength); + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-reverse") == 0)) { + rcProb = strtodouble(argv[++arg]); + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-seed") == 0)) { + mt.mtSetSeed(strtouint32(argv[++arg])); + } + + else if ((mode == modeSimulate) && (strcmp(argv[arg], "-test") == 0)) { + test = true; + } + + else { + return(false); + } + + return(true); +} + + + +void +simulateParameters::showUsage(opMode mode) { + + if (mode != modeSimulate) + return; + + fprintf(stderr, "OPTIONS for simulate mode:\n"); + fprintf(stderr, " -genome G sample reads from these sequences\n"); + fprintf(stderr, " -circular treat the sequences in G as circular\n"); + fprintf(stderr, " -truncate sample uniformly, at expense of read length\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -genomesize g genome size to use for deciding coverage below\n"); + fprintf(stderr, " -coverage c generate approximately c coverage of output\n"); + fprintf(stderr, " -nreads n generate exactly n reads of output\n"); + fprintf(stderr, " -nbases n generate approximately n bases of output\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -distribution F generate read length by sampling the distribution in file F\n"); + fprintf(stderr, " one column - each line is the length of a sequence\n"); + fprintf(stderr, " two columns - each line has the 'length' and 'number of sequences'\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " if file F doesn't exist, use a built-in distribution\n"); + fprintf(stderr, " ultra-long-nanopore\n"); + fprintf(stderr, " pacbio\n"); + fprintf(stderr, " pacbio-hifi\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -length min[-max] generate read length uniformly from range min-max\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -reverse p output a reverse-complement read with probability p\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -test generate a read at every possible start position\n"); + fprintf(stderr, "\n"); +} + + + +bool +simulateParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeSimulate) + return(false); + + if ((genomeName == nullptr) || + (genomeName[0] == 0)) + sprintf(errors, "ERROR: No reference genome sequence (-genome) supplied.\n"); + + // Load any read length distribution. + + if ((distribName == nullptr) || + (distribName[0] == 0)) { + char const *path = findSharedFile("share/sequence", distribName); + + if ((path == nullptr) || + (fileExists(path) == false)) { + sprintf(errors, "ERROR: File '%s' doesn't exist, and not in any data directory I know about.\n", distribName); + return(true); + } + + dist.loadDistribution(path); + } + + return(errors.size() > 0); +} -using namespace std; class simRead { @@ -62,33 +183,6 @@ public: -void -simulateParameters::finalize() { - - if (distribName[0]) { - char const *path = findSharedFile("share/sequence", distribName); - - if (path == NULL) { - fprintf(stderr, "ERROR: File '%s' doesn't exist, and not in any data directory I know about.\n", distribName); - exit(1); - } - - // If the path is a file -- which it should be -- load it. Else, fail. - - if (fileExists(path) == true) { - fprintf(stderr, "load '%s'\n", path); - dist.loadDistribution(path); - } - - else { - fprintf(stderr, "ERROR: File '%s' doesn't exist, and not in any data directory I know about.\n", distribName); - exit(1); - } - } -} - - - void doSimulate_loadSequences(simulateParameters &simPar, vector &seqs, @@ -429,8 +523,8 @@ doSimulate_test(simulateParameters &simPar, void -doSimulate(vector &inputs, - simulateParameters &simPar) { +doSimulate(vector &inputs, + simulateParameters &simPar) { // Decide how many reads or bases to make. @@ -446,9 +540,6 @@ doSimulate(vector &inputs, if (simPar.desiredNumBases > 0) nBasesMax = simPar.desiredNumBases; - if (simPar.randomSeedValid) - simPar.mt.mtSetSeed(simPar.randomSeed); - // Fail? if ((nBasesMax == uint64max) && diff --git a/src/seqrequester/simulate.H b/src/seqrequester/simulate.H index ad98026..65da0c0 100644 --- a/src/seqrequester/simulate.H +++ b/src/seqrequester/simulate.H @@ -26,20 +26,15 @@ class simulateParameters { public: - simulateParameters() { - } + simulateParameters() {} + ~simulateParameters() {} - ~simulateParameters() { - } - - - void initialize(void) { - }; - - - void finalize(void); + bool parseOption(opMode &mode, int32 &arg, int32 argc, char **argv); + void showUsage(opMode mode); + bool checkOptions(opMode mode, std::vector &inputs, std::vector &errors); +public: uint64 genomeSize = 0; bool circular = false; bool truncate = false; @@ -55,13 +50,9 @@ public: double rcProb = 0.5; - bool randomSeedValid = false; - uint32 randomSeed = 0; - mtRandom mt; sampledDistribution dist; - char genomeName[FILENAME_MAX+1] = { 0 }; - char distribName[FILENAME_MAX+1] = { 0 }; - char outputName[FILENAME_MAX+1] = { 0 }; + char const *genomeName = nullptr; + char const *distribName = nullptr; }; diff --git a/src/seqrequester/summarize.C b/src/seqrequester/summarize.C index 83d1dc4..eaf04a3 100644 --- a/src/seqrequester/summarize.C +++ b/src/seqrequester/summarize.C @@ -19,8 +19,85 @@ #include "seqrequester.H" -#include "arrays.H" -#include "sequence.H" +bool +summarizeParameters::parseOption(opMode &mode, int32 &arg, int32 argc, char **argv) { + + if (strcmp(argv[arg], "summarize") == 0) { + mode = modeSummarize; + } + + else if ((mode == modeSummarize) && (strcmp(argv[arg], "-size") == 0)) { + genomeSize = strtoull(argv[++arg], nullptr, 10); + } + + else if ((mode == modeSummarize) && (strcmp(argv[arg], "-1x") == 0)) { + limitTo1x = true; + } + + else if ((mode == modeSummarize) && (strcmp(argv[arg], "-split-n") == 0)) { + breakAtN = true; + } + + else if ((mode == modeSummarize) && (strcmp(argv[arg], "-simple") == 0)) { + asSimple = true; + } + + else if ((mode == modeSummarize) && (strcmp(argv[arg], "-lengths") == 0)) { + asLength = true; + } + + else if ((mode == modeSummarize) && (strcmp(argv[arg], "-assequences") == 0)) { + asSequences = true; + asBases = false; + } + + else if ((mode == modeSummarize) && (strcmp(argv[arg], "-asbases") == 0)) { + asSequences = false; + asBases = true; + } + + else { + return(false); + } + + return(true); +} + + + +void +summarizeParameters::showUsage(opMode mode) { + + if (mode != modeSummarize) + return; + + fprintf(stderr, "OPTIONS for summarize mode:\n"); + fprintf(stderr, " -size base size to use for N50 statistics\n"); + fprintf(stderr, " -1x limit NG table to 1x coverage\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -split-n split sequences at N bases before computing length\n"); + fprintf(stderr, " -simple output a simple 'length numSequences' histogram\n"); + fprintf(stderr, " -lengths output a list of the sequence lengths\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -assequences load data as complete sequences (for testing)\n"); + fprintf(stderr, " -asbases load data as blocks of bases (for testing)\n"); + fprintf(stderr, "\n"); +} + + + +bool +summarizeParameters::checkOptions(opMode mode, vector &inputs, vector &errors) { + + if (mode != modeSummarize) + return(false); + + if (inputs.size() == 0) + sprintf(errors, "ERROR: No input sequence files supplied.\n"); + + return(errors.size() > 0); +} + bool @@ -323,7 +400,7 @@ doSummarize_lengthHistogram(uint64 *shortLengths, } }; - uint64 ns = 0; + uint64 ns = 1; // Sequences start at 1, not zero! for (uint64 ii=0; ii &inputs, +doSummarize(vector &inputs, summarizeParameters &sumPar) { uint32 shortLengthsLen = 1048576; uint64 *shortLengths = new uint64 [shortLengthsLen]; @@ -396,10 +474,10 @@ doSummarize(vector &inputs, double ntn = 0; uint32 nameMax = 0; - char *name = NULL; + char *name = nullptr; uint64 seqMax = 0; - char *seq = NULL; - uint8 *qlt = NULL; + char *seq = nullptr; + uint8 *qlt = nullptr; uint64 seqLen = 0; for (uint32 ff=0; ff &inputs, std::vector &errors); uint64 genomeSize = 0; diff --git a/src/utility b/src/utility index d49b0a4..43e5690 160000 --- a/src/utility +++ b/src/utility @@ -1 +1 @@ -Subproject commit d49b0a410375938744fedf3c60f2a281506f744e +Subproject commit 43e56908c546a545b0628c1dc7f1b3b35b873e20