-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b0114ac
commit 4cc062f
Showing
8 changed files
with
1,886 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,277 @@ | ||
#ifndef EDLIB_H | ||
#define EDLIB_H | ||
|
||
/** | ||
* @file | ||
* @author Martin Sosic | ||
* @brief Main header file, containing all public functions and structures. | ||
*/ | ||
|
||
// Define EDLIB_API macro to properly export symbols | ||
#ifdef EDLIB_SHARED | ||
# ifdef _WIN32 | ||
# ifdef EDLIB_BUILD | ||
# define EDLIB_API __declspec(dllexport) | ||
# else | ||
# define EDLIB_API __declspec(dllimport) | ||
# endif | ||
# else | ||
# define EDLIB_API __attribute__ ((visibility ("default"))) | ||
# endif | ||
#else | ||
# define EDLIB_API | ||
#endif | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
// Status codes | ||
#define EDLIB_STATUS_OK 0 | ||
#define EDLIB_STATUS_ERROR 1 | ||
|
||
/** | ||
* Alignment methods - how should Edlib treat gaps before and after query? | ||
*/ | ||
typedef enum { | ||
/** | ||
* Global method. This is the standard method. | ||
* Useful when you want to find out how similar is first sequence to second sequence. | ||
*/ | ||
EDLIB_MODE_NW, | ||
/** | ||
* Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized. | ||
* What that means is that deleting elements from the end of second sequence is "free"! | ||
* For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end | ||
* of second sequence is "free" and does not count into total edit distance. This method is appropriate | ||
* when you want to find out how well first sequence fits at the beginning of second sequence. | ||
*/ | ||
EDLIB_MODE_SHW, | ||
/** | ||
* Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are | ||
* not penalized. What that means is that deleting elements from the start and end of second sequence is "free"! | ||
* For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start | ||
* and GAC from the end of second sequence is "free" and does not count into total edit distance. | ||
* This method is appropriate when you want to find out how well first sequence fits at any part of | ||
* second sequence. | ||
* For example, if your second sequence was a long text and your first sequence was a sentence from that text, | ||
* but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in | ||
* that text. In bioinformatics, this method is appropriate for aligning read to a sequence. | ||
*/ | ||
EDLIB_MODE_HW | ||
} EdlibAlignMode; | ||
|
||
/** | ||
* Alignment tasks - what do you want Edlib to do? | ||
*/ | ||
typedef enum { | ||
EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations. | ||
EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations. | ||
EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path. | ||
} EdlibAlignTask; | ||
|
||
/** | ||
* Describes cigar format. | ||
* @see http://samtools.github.io/hts-specs/SAMv1.pdf | ||
* @see http://drive5.com/usearch/manual/cigar.html | ||
*/ | ||
typedef enum { | ||
EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'. | ||
EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'. | ||
} EdlibCigarFormat; | ||
|
||
// Edit operations. | ||
#define EDLIB_EDOP_MATCH 0 //!< Match. | ||
#define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query. | ||
#define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query. | ||
#define EDLIB_EDOP_MISMATCH 3 //!< Mismatch. | ||
|
||
/** | ||
* @brief Defines two given characters as equal. | ||
*/ | ||
typedef struct { | ||
char first; | ||
char second; | ||
} EdlibEqualityPair; | ||
|
||
/** | ||
* @brief Configuration object for edlibAlign() function. | ||
*/ | ||
typedef struct { | ||
/** | ||
* Set k to non-negative value to tell edlib that edit distance is not larger than k. | ||
* Smaller k can significantly improve speed of computation. | ||
* If edit distance is larger than k, edlib will set edit distance to -1. | ||
* Set k to negative value and edlib will internally auto-adjust k until score is found. | ||
*/ | ||
int k; | ||
|
||
/** | ||
* Alignment method. | ||
* EDLIB_MODE_NW: global (Needleman-Wunsch) | ||
* EDLIB_MODE_SHW: prefix. Gap after query is not penalized. | ||
* EDLIB_MODE_HW: infix. Gaps before and after query are not penalized. | ||
*/ | ||
EdlibAlignMode mode; | ||
|
||
/** | ||
* Alignment task - tells Edlib what to calculate. Less to calculate, faster it is. | ||
* EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target. | ||
* EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target. | ||
* EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target). | ||
*/ | ||
EdlibAlignTask task; | ||
|
||
/** | ||
* List of pairs of characters, where each pair defines two characters as equal. | ||
* This way you can extend edlib's definition of equality (which is that each character is equal only | ||
* to itself). | ||
* This can be useful if you have some wildcard characters that should match multiple other characters, | ||
* or e.g. if you want edlib to be case insensitive. | ||
* Can be set to NULL if there are none. | ||
*/ | ||
const EdlibEqualityPair* additionalEqualities; | ||
|
||
/** | ||
* Number of additional equalities, which is non-negative number. | ||
* 0 if there are none. | ||
*/ | ||
int additionalEqualitiesLength; | ||
} EdlibAlignConfig; | ||
|
||
/** | ||
* Helper method for easy construction of configuration object. | ||
* @return Configuration object filled with given parameters. | ||
*/ | ||
EDLIB_API EdlibAlignConfig edlibNewAlignConfig( | ||
int k, EdlibAlignMode mode, EdlibAlignTask task, | ||
const EdlibEqualityPair* additionalEqualities, | ||
int additionalEqualitiesLength | ||
); | ||
|
||
/** | ||
* @return Default configuration object, with following defaults: | ||
* k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities. | ||
*/ | ||
EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void); | ||
|
||
|
||
/** | ||
* Container for results of alignment done by edlibAlign() function. | ||
*/ | ||
typedef struct { | ||
/** | ||
* EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values. | ||
*/ | ||
int status; | ||
|
||
/** | ||
* -1 if k is non-negative and edit distance is larger than k. | ||
*/ | ||
int editDistance; | ||
|
||
/** | ||
* Array of zero-based positions in target where optimal alignment paths end. | ||
* If gap after query is penalized, gap counts as part of query (NW), otherwise not. | ||
* Set to NULL if edit distance is larger than k. | ||
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). | ||
*/ | ||
int* endLocations; | ||
|
||
/** | ||
* Array of zero-based positions in target where optimal alignment paths start, | ||
* they correspond to endLocations. | ||
* If gap before query is penalized, gap counts as part of query (NW), otherwise not. | ||
* Set to NULL if not calculated or if edit distance is larger than k. | ||
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). | ||
*/ | ||
int* startLocations; | ||
|
||
/** | ||
* Number of end (and start) locations. | ||
*/ | ||
int numLocations; | ||
|
||
/** | ||
* Alignment is found for first pair of start and end locations. | ||
* Set to NULL if not calculated. | ||
* Alignment is sequence of numbers: 0, 1, 2, 3. | ||
* 0 stands for match. | ||
* 1 stands for insertion to target. | ||
* 2 stands for insertion to query. | ||
* 3 stands for mismatch. | ||
* Alignment aligns query to target from begining of query till end of query. | ||
* If gaps are not penalized, they are not in alignment. | ||
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). | ||
*/ | ||
unsigned char* alignment; | ||
|
||
/** | ||
* Length of alignment. | ||
*/ | ||
int alignmentLength; | ||
|
||
/** | ||
* Number of different characters in query and target together. | ||
*/ | ||
int alphabetLength; | ||
} EdlibAlignResult; | ||
|
||
/** | ||
* Frees memory in EdlibAlignResult that was allocated by edlib. | ||
* If you do not use it, make sure to free needed members manually using free(). | ||
*/ | ||
EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result); | ||
|
||
|
||
/** | ||
* Aligns two sequences (query and target) using edit distance (levenshtein distance). | ||
* Through config parameter, this function supports different alignment methods (global, prefix, infix), | ||
* as well as different modes of search (tasks). | ||
* It always returns edit distance and end locations of optimal alignment in target. | ||
* It optionally returns start locations of optimal alignment in target and alignment path, | ||
* if you choose appropriate tasks. | ||
* @param [in] query First sequence. | ||
* @param [in] queryLength Number of characters in first sequence. | ||
* @param [in] target Second sequence. | ||
* @param [in] targetLength Number of characters in second sequence. | ||
* @param [in] config Additional alignment parameters, like alignment method and wanted results. | ||
* @return Result of alignment, which can contain edit distance, start and end locations and alignment path. | ||
* Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members. | ||
*/ | ||
EDLIB_API EdlibAlignResult edlibAlign( | ||
const char* query, int queryLength, | ||
const char* target, int targetLength, | ||
const EdlibAlignConfig config | ||
); | ||
|
||
|
||
/** | ||
* Builds cigar string from given alignment sequence. | ||
* @param [in] alignment Alignment sequence. | ||
* 0 stands for match. | ||
* 1 stands for insertion to target. | ||
* 2 stands for insertion to query. | ||
* 3 stands for mismatch. | ||
* @param [in] alignmentLength | ||
* @param [in] cigarFormat Cigar will be returned in specified format. | ||
* @return Cigar string. | ||
* I stands for insertion. | ||
* D stands for deletion. | ||
* X stands for mismatch. (used only in extended format) | ||
* = stands for match. (used only in extended format) | ||
* M stands for (mis)match. (used only in standard format) | ||
* String is null terminated. | ||
* Needed memory is allocated and given pointer is set to it. | ||
* Do not forget to free it later using free()! | ||
*/ | ||
EDLIB_API char* edlibAlignmentToCigar( | ||
const unsigned char* alignment, int alignmentLength, | ||
EdlibCigarFormat cigarFormat | ||
); | ||
|
||
#ifdef __cplusplus | ||
} | ||
#endif | ||
|
||
#endif // EDLIB_H |
Oops, something went wrong.