From 8e2865fe1aca65726d816c81a1c4aac285d0a44c Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 16 Aug 2020 14:31:54 -0400 Subject: [PATCH 01/20] Fix overlaps & improve output: --- srrTomat0/ATACBedTomat0.py | 24 ++++++++++++------- srrTomat0/processor/prior.py | 46 +++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/srrTomat0/ATACBedTomat0.py b/srrTomat0/ATACBedTomat0.py index 25db0f2..4ddd56e 100644 --- a/srrTomat0/ATACBedTomat0.py +++ b/srrTomat0/ATACBedTomat0.py @@ -4,6 +4,8 @@ from srrTomat0.motifs import motifs_to_dataframe, INFO_COL, MOTIF_NAME_COL import argparse +import os +import pathlib import pandas as pd @@ -13,7 +15,7 @@ def main(): ap.add_argument("-a", "--atac", dest="atac", help="ATAC BED file", metavar="FILE", default=None) ap.add_argument("-f", "--fasta", dest="fasta", help="Genomic FASTA file", metavar="FILE", required=True) ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="PATH", required=True) - ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) + ap.add_argument("-o", "--out", dest="out", help="Output PATH prefix", metavar="PATH", required=True) ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=0, nargs="+") ap.add_argument("-c", "--cpu", dest="cores", help="Number of cores", metavar="CORES", type=int, default=1) ap.add_argument("--tss", dest="tss", help="Use TSS for window", action='store_const', const=True, default=False) @@ -25,16 +27,22 @@ def main(): ap.add_argument("--threshold", nargs="+", default=None, type=str) args = ap.parse_args() + out_prefix = os.path.abspath(os.path.expanduser(args.out)) + out_path = os.path.join(*pathlib.PurePath(out_prefix).parts[:-1]) + if not os.path.exists(out_path): + os.makedirs(out_prefix) if args.threshold is None: - prior_edges, prior_matrix, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation, 
args.fasta, - window_size=args.window_size, num_cores=args.cores, + prior_edges, prior_matrix, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation, + args.fasta, + window_size=args.window_size, + num_cores=args.cores, use_tss=args.tss, motif_ic=args.min_ic, scaner_type=args.scanner) - prior_matrix.astype(int).to_csv(args.out, sep="\t") - prior_edges.to_csv(args.out + ".edges.tsv.gz", sep="\t") - raw_matrix.to_csv(args.out + ".raw.tsv", sep="\t") + prior_matrix.astype(int).to_csv(out_prefix + "_edge_matrix.tsv.gz", sep="\t") + prior_edges.to_csv(out_prefix + "_edge_table.tsv.gz", sep="\t") + raw_matrix.to_csv(out_prefix + "_unfiltered_matrix.tsv.gz", sep="\t") else: motifs = MotifScan.load_motif_file(args.motif) motif_information = motifs_to_dataframe(motifs) @@ -55,12 +63,12 @@ def main(): edge_count = pd.concat(edge_count, axis=1) edge_count = edge_count.join(motif_information[INFO_COL]) - edge_count.to_csv(args.out, sep="\t") + edge_count.to_csv(out_prefix + "_edge_count.tsv", sep="\t") def build_atac_motif_prior(motif_meme_file, atac_bed_file, annotation_file, genomic_fasta_file, window_size=0, use_tss=True, scaner_type='fimo', num_cores=1, motif_ic=6, tandem=100, - truncate_motifs=0.35, scanner_thresh="5e-4"): + truncate_motifs=0.35, scanner_thresh="1e-4"): # Set the scanner type if scaner_type.lower() == 'fimo': MotifScan.set_type_fimo() diff --git a/srrTomat0/processor/prior.py b/srrTomat0/processor/prior.py index d6c34a5..f44cb84 100644 --- a/srrTomat0/processor/prior.py +++ b/srrTomat0/processor/prior.py @@ -3,10 +3,10 @@ from srrTomat0.motifs import INFO_COL, MOTIF_COL, LEN_COL, SCAN_SCORE_COL, MOTIF_NAME_COL import pandas as pd +import pandas.api.types as pat import numpy as np import pathos.multiprocessing as multiprocessing from sklearn.cluster import DBSCAN -from collections import Counter PRIOR_TF = 'regulator' @@ -66,12 +66,22 @@ def score_tf(cls, tf_motifs): if n_sites == 0: return None + # Sort and check for overlapping 
motifs + tf_motifs = tf_motifs.sort_values(by=MotifScan.start_col) + overlap = tf_motifs[MotifScan.start_col] < tf_motifs[MotifScan.stop_col].shift() + + # Collapse together any overlapping motifs to the maximum score + if overlap.any(): + tf_motifs["group_up"] = (~overlap).cumsum() + tf_motifs = tf_motifs.groupby("group_up").agg({MotifScan.start_col: "min", + MotifScan.stop_col: "max", + SCAN_SCORE_COL: "max"}) + n_sites = tf_motifs.shape[0] + # If there's only one site check it and then return if n_sites == 1: return cls._top_hit(tf_motifs) - tf_motifs = tf_motifs.sort_values(by=MotifScan.start_col) - # If there's only two sites check it and then return if n_sites == 2: consider_tandem = tf_motifs.iloc[0, :][MotifScan.stop_col] - tf_motifs.iloc[1, :][MotifScan.start_col] @@ -146,7 +156,7 @@ def _first_value(series): return series -def build_prior_from_atac_motifs(genes, motif_peaks, motif_information, num_workers=1, seed=42): +def build_prior_from_atac_motifs(genes, motif_peaks, motif_information, num_workers=1, seed=42, min_filter=0.005): """ Construct a prior [G x K] interaction matrix :param genes: pd.DataFrame [G x n] @@ -196,25 +206,23 @@ def _prior_mapper(data): np.random.seed(seed) - target_size = int(0.005 * genes.shape[0]) thresholded_data = [] # Threshold using DBSCAN outlier detection for reg in prior_data[PRIOR_TF].unique(): reg_edge = prior_data.loc[prior_data[PRIOR_TF] == reg, :] - if reg_edge.shape[0] > target_size: - reg_edge = reg_edge.loc[_find_outliers_dbscan(reg_edge), :] + reg_edge = reg_edge.loc[_find_outliers_dbscan(reg_edge), :] thresholded_data.append(reg_edge.copy()) + # Pivot to a matrix, extend to all TFs, and fill with 1s + raw_matrix = prior_data.pivot(index=PRIOR_GENE, columns=PRIOR_TF, values=PRIOR_SCORE) + raw_matrix = raw_matrix.reindex(motif_names, axis=1).reindex(genes[GTF_GENENAME], axis=0).fillna(0) + thresholded_data = pd.concat(thresholded_data).reset_index(drop=True) # Pivot to a matrix, extend to all TFs, and fill with 1s 
prior_matrix = thresholded_data.pivot(index=PRIOR_GENE, columns=PRIOR_TF, values=PRIOR_SCORE) prior_matrix = prior_matrix.reindex(motif_names, axis=1).reindex(genes[GTF_GENENAME], axis=0).fillna(0) - # Pivot to a matrix, extend to all TFs, and fill with 1s - raw_matrix = prior_data.pivot(index=PRIOR_GENE, columns=PRIOR_TF, values=PRIOR_SCORE) - raw_matrix = raw_matrix.reindex(motif_names, axis=1).reindex(genes[GTF_GENENAME], axis=0).fillna(0) - return thresholded_data, prior_matrix, raw_matrix @@ -250,6 +258,22 @@ def _find_outliers_dbscan(tf_data): return keep_edge +def _find_outliers_stability(tf_data, steps=100, threshold=0.01): + scores = tf_data[PRIOR_SCORE].values.reshape(-1, 1) + smin, smax = scores.min(), scores.max() + + step_size = (smax - smin) / steps + step_cuts = np.array([smin + step_size * i for i in range(steps)]) + stabs = np.array([np.mean([scores <= s]) for s in step_cuts]) + diffs = pd.Series(np.roll(stabs, -1)[:-1] - stabs[:-1]).cummax().values + + threshold_steps = step_cuts[1:][diffs < threshold] + selected_score_cut = np.max(threshold_steps) if len(threshold_steps) > 0 else step_cuts[0] + print(tf_data[PRIOR_TF].iloc[0], end=": ") + print(selected_score_cut) + return scores >= selected_score_cut + + def _build_prior_for_gene(gene_info, motif_data, motif_information, num_iteration): """ Takes ATAC peaks and Motif locations near a single gene and turns them into TF-gene scores From 6b00c40256e73fa6172bb8f0cc05920886961ad1 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 23 Aug 2020 18:38:03 -0400 Subject: [PATCH 02/20] TRANSFAC --- srrTomat0/motifs/__init__.py | 3 +- srrTomat0/motifs/_motif.py | 19 +++++++-- srrTomat0/motifs/fimo.py | 5 ++- srrTomat0/motifs/homer.py | 5 ++- srrTomat0/motifs/transfac.py | 78 ++++++++++++++++++++++++++++++++++++ 5 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 srrTomat0/motifs/transfac.py diff --git a/srrTomat0/motifs/__init__.py b/srrTomat0/motifs/__init__.py index e9c876b..57996db 
100644 --- a/srrTomat0/motifs/__init__.py +++ b/srrTomat0/motifs/__init__.py @@ -1,3 +1,4 @@ from srrTomat0.motifs._motif import (Motif, motifs_to_dataframe, chunk_motifs, - INFO_COL, MOTIF_COL, ENTROPY_COL, LEN_COL, OCC_COL, MOTIF_NAME_COL, SCAN_SCORE_COL) + INFO_COL, MOTIF_COL, ENTROPY_COL, LEN_COL, OCC_COL, MOTIF_NAME_COL, SCAN_SCORE_COL, + SCORE_PER_BASE) from srrTomat0.motifs.motif_scan import MotifScan diff --git a/srrTomat0/motifs/_motif.py b/srrTomat0/motifs/_motif.py index b3eea6e..a4102b8 100644 --- a/srrTomat0/motifs/_motif.py +++ b/srrTomat0/motifs/_motif.py @@ -18,6 +18,7 @@ MOTIF_NAME_COL = "Motif_Name" SCAN_SCORE_COL = "Tomat0_Score" +SCORE_PER_BASE = "Per Base Array" class Motif: @@ -29,6 +30,7 @@ class Motif: _motif_prob_array = None _motif_alphabet = None _motif_background = None + _motif_species = None _alphabet_map = None _consensus_seq = None _info_matrix = None @@ -125,6 +127,15 @@ def threshold_ln_odds(self): second_prob = np.sort(self.probability_matrix, axis=1)[:, 2] return self.max_ln_odds - max((np.sum(np.log(second_prob[second_prob > 0.25] / 0.25)), 0.1 * self.max_ln_odds)) + @property + def species(self): + return self._motif_species + + @species.setter + def species(self, new_species): + self._motif_species = [] if self._motif_species is None else self._motif_species + self._motif_species.append(new_species) + def __len__(self): return self.probability_matrix.shape[0] if self.probability_matrix is not None else 0 @@ -161,10 +172,10 @@ def score_match(self, match, disallow_homopolymer=True, homopolymer_one_off_len= return 0 # Score anything with excessive nucleotides that have a p ~ 0.0 as 0 - if score_zero_as_zero is not None and sum(p < 0.001 for p in self.__prob_match(match)) > score_zero_as_zero: + if score_zero_as_zero is not None and sum(p < 0.001 for p in self._prob_match(match)) > score_zero_as_zero: return 0 - mse_ic = np.sum(np.square(np.subtract(self.__info_match(self.consensus), self.__info_match(match)))) + mse_ic = 
np.sum(np.square(np.subtract(self._info_match(self.consensus), self._info_match(match)))) return max((self.information_content - mse_ic, 0)) def truncate(self, threshold=0.35): @@ -173,10 +184,10 @@ def truncate(self, threshold=0.35): self.probability_matrix = self.probability_matrix[keepers, :] self._motif_probs = list(itertools.compress(self._motif_probs, keepers)) - def __prob_match(self, match): + def _prob_match(self, match): return [self.probability_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)] - def __info_match(self, match): + def _info_match(self, match): return [self.ic_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)] diff --git a/srrTomat0/motifs/fimo.py b/srrTomat0/motifs/fimo.py index b888548..7620265 100644 --- a/srrTomat0/motifs/fimo.py +++ b/srrTomat0/motifs/fimo.py @@ -1,10 +1,11 @@ import io import subprocess import pandas as pd +import numpy as np import pandas.errors as pde from srrTomat0 import FIMO_EXECUTABLE_PATH -from srrTomat0.motifs import meme, chunk_motifs, SCAN_SCORE_COL +from srrTomat0.motifs import meme, chunk_motifs, SCAN_SCORE_COL, SCORE_PER_BASE from srrTomat0.motifs._motif import __MotifScanner FIMO_DATA_SUFFIX = ".fimo.tsv" @@ -59,6 +60,8 @@ def _parse_output(self, output_handle): motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])] + motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in + zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])] return motifs except pde.EmptyDataError: diff --git a/srrTomat0/motifs/homer.py b/srrTomat0/motifs/homer.py index 4e409aa..156d100 100644 --- a/srrTomat0/motifs/homer.py +++ b/srrTomat0/motifs/homer.py @@ -1,8 +1,9 @@ import subprocess import io import pandas as pd +import numpy as np -from srrTomat0.motifs import chunk_motifs, homer_motif, SCAN_SCORE_COL +from srrTomat0.motifs import chunk_motifs, homer_motif, SCAN_SCORE_COL, SCORE_PER_BASE from 
srrTomat0.motifs._motif import __MotifScanner from srrTomat0 import HOMER_EXECUTABLE_PATH @@ -63,5 +64,7 @@ def _parse_output(self, output_handle): motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])] + motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in + zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])] return motifs diff --git a/srrTomat0/motifs/transfac.py b/srrTomat0/motifs/transfac.py new file mode 100644 index 0000000..d3f74ec --- /dev/null +++ b/srrTomat0/motifs/transfac.py @@ -0,0 +1,78 @@ +from srrTomat0.motifs import Motif + +import numpy as np + +TRANSFAC_CODES = {"AC": "Accession", + "ID": "ID", + "NA": "Name", + "DT": "Date", + "CO": "Copyright", + "DE": "Description", + "TY": "Type", + "OS": "", + "OL": "", + "BF": "Species", + "P0": "Alphabet", + "SR": "", + "BA": "", + "CC": "", + "PR": "Profile"} + + +def read(file_descript): + + # Parse if it's a string + if isinstance(file_descript, str): + with open(file_descript) as motif_fh: + return [m for m in _parse_transfac_file(motif_fh)] + + # Parse if it's a file handle + else: + return [m for m in _parse_transfac_file(file_descript)] + + +def _parse_transfac_file(transfac_fh): + return [m for m in __parse_motif_gen(transfac_fh)] + + +def __parse_motif_gen(handle): + + active_motif = None + active_ac, active_id, active_species, active_name = None, None, None, [] + + for line in handle: + line = line.strip().lower() + + # Spacer + if line.startswith("XX"): + continue + + # New record + elif line.startswith("//") and active_motif is not None: + yield active_motif + active_ac, active_id, active_alphabet, active_species = None, None, None, [] + + # Accession + elif line.startswith("AC"): + active_ac = line[2:].strip() + + # ID + elif line.startswith("ID"): + active_id = line[2:].strip() + + # Name + elif line.startswith("NA"): + active_name = line[2:].strip() + + # Alphabet + elif line.startswith("P0"): + active_motif = 
Motif(active_ac, active_name, line[2:].strip().split()) + + # Prob + elif line[:2].isdigit(): + probs = line[2:].strip().split()[:-1] + total_seqs = sum(line) + active_motif.add_prob_line(list(map(lambda x: float(x) / total_seqs, probs))) + + if active_motif is not None: + yield active_motif From f4d2c8d220dacd292b774a4fad0ddcc693eae392 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 23 Aug 2020 18:38:29 -0400 Subject: [PATCH 03/20] Better overlap handling --- srrTomat0/processor/prior.py | 83 +++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/srrTomat0/processor/prior.py b/srrTomat0/processor/prior.py index f44cb84..19c1702 100644 --- a/srrTomat0/processor/prior.py +++ b/srrTomat0/processor/prior.py @@ -1,6 +1,6 @@ from srrTomat0.processor.gtf import GTF_GENENAME, GTF_CHROMOSOME, SEQ_START, SEQ_STOP from srrTomat0.motifs.motif_scan import MotifScan -from srrTomat0.motifs import INFO_COL, MOTIF_COL, LEN_COL, SCAN_SCORE_COL, MOTIF_NAME_COL +from srrTomat0.motifs import INFO_COL, MOTIF_COL, LEN_COL, SCAN_SCORE_COL, MOTIF_NAME_COL, SCORE_PER_BASE import pandas as pd import pandas.api.types as pat @@ -8,7 +8,6 @@ import pathos.multiprocessing as multiprocessing from sklearn.cluster import DBSCAN - PRIOR_TF = 'regulator' PRIOR_GENE = 'target' PRIOR_COUNT = 'count' @@ -50,8 +49,6 @@ def score_tf(cls, tf_motifs): Score a single TF :param tf_motifs: Motif binding sites from FIMO/HOMER :type tf_motifs: pd.DataFrame - :param motif_len: Length of the motif recognition site - :type motif_len: int :return: Score if the TF should be kept, None otherwise """ @@ -72,10 +69,16 @@ def score_tf(cls, tf_motifs): # Collapse together any overlapping motifs to the maximum score if overlap.any(): - tf_motifs["group_up"] = (~overlap).cumsum() - tf_motifs = tf_motifs.groupby("group_up").agg({MotifScan.start_col: "min", - MotifScan.stop_col: "max", - SCAN_SCORE_COL: "max"}) + + if (tf_motifs[MOTIF_NAME_COL] == "GAL4").any(): + 
print(tf_motifs.sort_values(by=MotifScan.start_col)) + + tf_motifs["GROUP"] = (~overlap).cumsum() + tf_motifs = pd.concat([cls._agg_per_base(group) for _, group in tf_motifs.groupby("GROUP")]) + + if (tf_motifs[MOTIF_NAME_COL] == "GAL4").any(): + print(tf_motifs.sort_values(by=MotifScan.start_col)) + n_sites = tf_motifs.shape[0] # If there's only one site check it and then return @@ -155,8 +158,29 @@ def _first_value(series): except AttributeError: return series + @classmethod + def _agg_per_base(cls, overlap_df): + """ + Aggregate an overlapping set of motif peaks by summing the maximum per-base IC for each base + :param overlap_df: + :return: + """ + if len(overlap_df) == 1: + return overlap_df[[MotifScan.start_col, MotifScan.stop_col, SCAN_SCORE_COL, MOTIF_NAME_COL]] + + new_df = [] + for i in overlap_df.index: + new_df.extend([(a, b) for a, b in zip(range(overlap_df.loc[i, MotifScan.start_col], + overlap_df.loc[i, MotifScan.stop_col]), + overlap_df.loc[i, SCORE_PER_BASE])]) + + return pd.DataFrame({MotifScan.start_col: [overlap_df[MotifScan.start_col].min()], + MotifScan.stop_col: [overlap_df[MotifScan.stop_col].max()], + SCAN_SCORE_COL: pd.DataFrame(new_df, columns=["B", "S"]).groupby("B").agg('max').sum(), + MOTIF_NAME_COL: [overlap_df[MOTIF_NAME_COL].unique()[0]]}) + -def build_prior_from_atac_motifs(genes, motif_peaks, motif_information, num_workers=1, seed=42, min_filter=0.005): +def build_prior_from_atac_motifs(genes, motif_peaks, motif_information, num_workers=1, seed=42): """ Construct a prior [G x K] interaction matrix :param genes: pd.DataFrame [G x n] @@ -176,7 +200,7 @@ def build_prior_from_atac_motifs(genes, motif_peaks, motif_information, num_work # Trim down the motif dataframe and put it into a dict by chromosome motif_peaks = motif_peaks.reindex([MotifScan.name_col, MotifScan.chromosome_col, MotifScan.start_col, - MotifScan.stop_col, SCAN_SCORE_COL], axis=1) + MotifScan.stop_col, SCAN_SCORE_COL, SCORE_PER_BASE], axis=1) motif_id_to_name = 
motif_information.reindex([MOTIF_COL, MOTIF_NAME_COL], axis=1) invalid_names = (pd.isnull(motif_id_to_name[MOTIF_NAME_COL]) | @@ -240,40 +264,29 @@ def _gene_gen(genes, motif_peaks): continue -def _find_outliers_dbscan(tf_data): +def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): scores = tf_data[PRIOR_SCORE].values.reshape(-1, 1) counts = tf_data.shape[0] - labels = DBSCAN(min_samples=np.sqrt(counts), eps=1).fit_predict(scores) - outlier_labels = labels == -1 + labels = DBSCAN(min_samples=np.log2(counts), eps=scores.max() / 100).fit_predict(scores) - mean_score = np.mean(scores) - keep_edge = pd.Series(outlier_labels & (tf_data[PRIOR_SCORE].values > mean_score), index=tf_data.index) + # Keep any outliers (outliers near 0 should be discarded) + keep_edge = pd.Series((labels == -1) & (tf_data[PRIOR_SCORE].values > np.mean(scores)), index=tf_data.index) - # Check the highest non-outlier cluster to see if it's worth including - lbl_idx = labels == (labels[scores[~keep_edge.values].argmax()]) - if (np.min(scores[lbl_idx]) > mean_score) and (np.sum(lbl_idx) < (2 * np.sum(outlier_labels))): - keep_edge |= lbl_idx + # Iterate through clusters in reverse order until at least t_1 and no more than t_2 edges are included + for lab in np.unique(labels)[::-1]: + current_ratio = keep_edge.sum() / keep_edge.size + new_labels = labels == lab + if current_ratio > t_1: + break + elif current_ratio + (new_labels.sum() / new_labels.size) > t_2: + break + else: + keep_edge |= new_labels return keep_edge -def _find_outliers_stability(tf_data, steps=100, threshold=0.01): - scores = tf_data[PRIOR_SCORE].values.reshape(-1, 1) - smin, smax = scores.min(), scores.max() - - step_size = (smax - smin) / steps - step_cuts = np.array([smin + step_size * i for i in range(steps)]) - stabs = np.array([np.mean([scores <= s]) for s in step_cuts]) - diffs = pd.Series(np.roll(stabs, -1)[:-1] - stabs[:-1]).cummax().values - - threshold_steps = step_cuts[1:][diffs < threshold] - selected_score_cut = 
np.max(threshold_steps) if len(threshold_steps) > 0 else step_cuts[0] - print(tf_data[PRIOR_TF].iloc[0], end=": ") - print(selected_score_cut) - return scores >= selected_score_cut - - def _build_prior_for_gene(gene_info, motif_data, motif_information, num_iteration): """ Takes ATAC peaks and Motif locations near a single gene and turns them into TF-gene scores From ad8c69b3251bc62dff2016b91a8fc071d62a57a4 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sat, 12 Sep 2020 14:01:50 -0400 Subject: [PATCH 04/20] TRANSFAC parsing --- srrTomat0/motifs/_motif.py | 63 ++++++++++++++++++++++++++++++++---- srrTomat0/motifs/transfac.py | 49 +++++++++++++++++----------- 2 files changed, 86 insertions(+), 26 deletions(-) diff --git a/srrTomat0/motifs/_motif.py b/srrTomat0/motifs/_motif.py index a4102b8..33b5ef9 100644 --- a/srrTomat0/motifs/_motif.py +++ b/srrTomat0/motifs/_motif.py @@ -31,6 +31,7 @@ class Motif: _motif_alphabet = None _motif_background = None _motif_species = None + _motif_accession = None _alphabet_map = None _consensus_seq = None _info_matrix = None @@ -40,6 +41,39 @@ class Motif: def alphabet(self): return self._motif_alphabet + @alphabet.setter + def alphabet(self, new_alphabet): + if new_alphabet is not None: + self._motif_alphabet = new_alphabet + self._alphabet_map = {ch.lower(): i for i, ch in enumerate(self._motif_alphabet)} + + @property + def accession(self): + return self._motif_accession + + @accession.setter + def accession(self, new_accession): + if new_accession is not None: + self._motif_accession = new_accession + + @property + def id(self): + return self.motif_id + + @id.setter + def id(self, new_id): + if new_id is not None: + self.motif_id = new_id + + @property + def name(self): + return self.motif_name + + @name.setter + def name(self, new_name): + if new_name is not None: + self.motif_name = new_name + @property def alphabet_len(self): return len(self._motif_alphabet) @@ -133,8 +167,16 @@ def species(self): @species.setter def 
species(self, new_species): - self._motif_species = [] if self._motif_species is None else self._motif_species - self._motif_species.append(new_species) + is_list = isinstance(new_species, (list, tuple)) + + if is_list and self._motif_species is None: + self._motif_species = new_species + elif is_list: + self._motif_species.extend(new_species) + elif self._motif_species is None: + self._motif_species = [new_species] + else: + self._motif_species.append(new_species) def __len__(self): return self.probability_matrix.shape[0] if self.probability_matrix is not None else 0 @@ -145,11 +187,10 @@ def __str__(self): el=len(self), ic=self.information_content) - def __init__(self, motif_id, motif_name, motif_alphabet, motif_background=None): - self.motif_id = motif_id - self.motif_name = motif_name - self._motif_alphabet = motif_alphabet - self._alphabet_map = {ch.lower(): i for i, ch in enumerate(self._motif_alphabet)} + def __init__(self, motif_id=None, motif_name=None, motif_alphabet=None, motif_background=None): + self.id = motif_id + self.name = motif_name + self.alphabet = motif_alphabet self._motif_background = motif_background self._motif_probs = [] @@ -190,6 +231,14 @@ def _prob_match(self, match): def _info_match(self, match): return [self.ic_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)] + def species_contains(self, match_str): + if self.species is not None: + match_str = match_str.lower() + return any(match_str in s.lower() for s in self.species) + else: + return False + + class __MotifScanner: diff --git a/srrTomat0/motifs/transfac.py b/srrTomat0/motifs/transfac.py index d3f74ec..8cf4737 100644 --- a/srrTomat0/motifs/transfac.py +++ b/srrTomat0/motifs/transfac.py @@ -37,42 +37,53 @@ def _parse_transfac_file(transfac_fh): def __parse_motif_gen(handle): - active_motif = None - active_ac, active_id, active_species, active_name = None, None, None, [] + active_motif = Motif() for line in handle: - line = line.strip().lower() + line = 
line.strip() + + if len(line) < 2: + continue + + line_id, line = line[:2].upper(), line[2:].strip() # Spacer - if line.startswith("XX"): + if line_id == "XX": continue # New record - elif line.startswith("//") and active_motif is not None: + elif line_id == "//" and len(active_motif) > 0: yield active_motif - active_ac, active_id, active_alphabet, active_species = None, None, None, [] + active_motif = Motif() + + elif line_id == "//": + active_motif = Motif() # Accession - elif line.startswith("AC"): - active_ac = line[2:].strip() + elif line_id == "AC": + active_motif.accession = line # ID - elif line.startswith("ID"): - active_id = line[2:].strip() + elif line_id == "ID": + active_motif.motif_id = line # Name - elif line.startswith("NA"): - active_name = line[2:].strip() + elif line_id == "NA": + active_motif.motif_name = line # Alphabet - elif line.startswith("P0"): - active_motif = Motif(active_ac, active_name, line[2:].strip().split()) + elif line_id == "P0": + active_motif.alphabet = line.split() + + elif line_id == "BF": + active_motif.species = line # Prob - elif line[:2].isdigit(): - probs = line[2:].strip().split()[:-1] - total_seqs = sum(line) - active_motif.add_prob_line(list(map(lambda x: float(x) / total_seqs, probs))) + elif line_id.isdigit(): + probs = list(map(float, line.split()[:-1])) + total_seqs = sum(probs) + active_motif.add_prob_line(list(map(lambda x: x / total_seqs, probs))) - if active_motif is not None: + if len(active_motif) > 0: yield active_motif + From e764e8c4c40eb6f807debf32f0eef29f3750fc99 Mon Sep 17 00:00:00 2001 From: asistradition Date: Mon, 14 Sep 2020 12:40:52 -0400 Subject: [PATCH 05/20] Allow different motif formats --- srrTomat0/ATACBedTomat0.py | 39 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/srrTomat0/ATACBedTomat0.py b/srrTomat0/ATACBedTomat0.py index 4ddd56e..4213991 100644 --- a/srrTomat0/ATACBedTomat0.py +++ b/srrTomat0/ATACBedTomat0.py @@ -11,7 +11,9 @@ def 
main(): ap = argparse.ArgumentParser(description="Create a prior from open chromatin peaks and motif peaks") - ap.add_argument("-m", "--motif", dest="motif", help="Motif MEME file", metavar="PATH", required=True) + ap.add_argument("-m", "--motif", dest="motif", help="Motif file", metavar="PATH", required=True) + ap.add_argument("--motif_format", dest="motif_format", help="Motif file FORMAT (transfac or meme)", + metavar="FORMAT", default="meme") ap.add_argument("-a", "--atac", dest="atac", help="ATAC BED file", metavar="FILE", default=None) ap.add_argument("-f", "--fasta", dest="fasta", help="Genomic FASTA file", metavar="FILE", required=True) ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="PATH", required=True) @@ -38,7 +40,8 @@ def main(): window_size=args.window_size, num_cores=args.cores, use_tss=args.tss, motif_ic=args.min_ic, - scaner_type=args.scanner) + scanner_type=args.scanner, + motif_format=args.motif_format) prior_matrix.astype(int).to_csv(out_prefix + "_edge_matrix.tsv.gz", sep="\t") prior_edges.to_csv(out_prefix + "_edge_table.tsv.gz", sep="\t") @@ -55,8 +58,9 @@ def main(): window_size=args.window_size, num_cores=args.cores, use_tss=args.tss, motif_ic=args.min_ic, - scaner_type=args.scanner, - scanner_thresh=t) + scanner_type=args.scanner, + scanner_thresh=t, + motif_format=args.motif_format) edge_count[t] = (raw_matrix != 0).sum(axis=0) @@ -66,13 +70,13 @@ def main(): edge_count.to_csv(out_prefix + "_edge_count.tsv", sep="\t") -def build_atac_motif_prior(motif_meme_file, atac_bed_file, annotation_file, genomic_fasta_file, window_size=0, - use_tss=True, scaner_type='fimo', num_cores=1, motif_ic=6, tandem=100, - truncate_motifs=0.35, scanner_thresh="1e-4"): +def build_atac_motif_prior(motif_file, atac_bed_file, annotation_file, genomic_fasta_file, window_size=0, + use_tss=True, scanner_type='fimo', num_cores=1, motif_ic=6, tandem=100, + truncate_motifs=0.35, scanner_thresh="1e-4", motif_format="meme"): # Set the 
scanner type - if scaner_type.lower() == 'fimo': + if scanner_type.lower() == 'fimo': MotifScan.set_type_fimo() - elif scaner_type.lower() == 'homer': + elif scanner_type.lower() == 'homer': MotifScan.set_type_homer() else: raise ValueError("motif_type must be fimo or homer") @@ -84,13 +88,22 @@ def build_atac_motif_prior(motif_meme_file, atac_bed_file, annotation_file, geno genes = load_gtf_to_dataframe(annotation_file) print("\t{n} genes loaded".format(n=genes.shape[0])) - genes = open_window(genes, window_size=window_size, use_tss=use_tss) + genes = open_window(genes, window_size=window_size, use_tss=use_tss, check_against_fasta=genomic_fasta_file) print("\tPromoter regions defined with window {w}".format(w=window_size)) # PROCESS MOTIF PWMS # - print("Loading motifs from file ({f})".format(f=motif_meme_file)) - motifs = MotifScan.load_motif_file(motif_meme_file) + print("Loading motifs from file ({f})".format(f=motif_file)) + if motif_format.lower() == "meme": + from srrTomat0.motifs.meme import read + elif motif_format.lower() == "transfac": + from srrTomat0.motifs.transfac import read + elif motif_format.lower() == "homer": + from srrTomat0.motifs.homer_motif import read + else: + raise ValueError("motif_format must be 'meme', 'homer', or 'transfac'") + + motifs = read(motif_file) motif_information = motifs_to_dataframe(motifs) print("\t{n} motifs loaded".format(n=len(motif_information))) @@ -100,7 +113,7 @@ def build_atac_motif_prior(motif_meme_file, atac_bed_file, annotation_file, geno # SCAN CHROMATIN FOR MOTIFS # # Load and scan target chromatin peaks - print("Scanning target chromatin ({f_c}) for motifs ({f_m})".format(f_c=atac_bed_file, f_m=motif_meme_file)) + print("Scanning target chromatin ({f_c}) for motifs ({f_m})".format(f_c=atac_bed_file, f_m=motif_file)) gene_locs = genes.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND]].copy() gene_locs[[SEQ_START, SEQ_STOP]] = gene_locs[[SEQ_START, SEQ_STOP]].astype(int) From 
ccf1d3afcde83e5b47658b9f2d8c251cc71046d8 Mon Sep 17 00:00:00 2001 From: asistradition Date: Mon, 14 Sep 2020 12:41:04 -0400 Subject: [PATCH 06/20] Save count matrix --- srrTomat0/motifs/_motif.py | 12 +++++++++++- srrTomat0/motifs/transfac.py | 7 ++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/srrTomat0/motifs/_motif.py b/srrTomat0/motifs/_motif.py index 33b5ef9..f71ba43 100644 --- a/srrTomat0/motifs/_motif.py +++ b/srrTomat0/motifs/_motif.py @@ -27,6 +27,7 @@ class Motif: motif_url = None _motif_probs = None + _motif_counts = None _motif_prob_array = None _motif_alphabet = None _motif_background = None @@ -96,6 +97,10 @@ def probability_matrix(self): def probability_matrix(self, matrix): self._motif_prob_array = matrix + @property + def count_matrix(self): + return np.array(self._motif_counts) if self._motif_counts is not None else None + @property def shannon_entropy(self): @@ -197,6 +202,12 @@ def __init__(self, motif_id=None, motif_name=None, motif_alphabet=None, motif_ba def add_prob_line(self, line): self._motif_probs.append(line) + def add_count_line(self, line): + if self._motif_counts is not None: + self._motif_counts.append(line) + else: + self._motif_counts = [line] + def score_match(self, match, disallow_homopolymer=True, homopolymer_one_off_len=6, score_zero_as_zero=1): if len(match) != len(self): @@ -239,7 +250,6 @@ def species_contains(self, match_str): return False - class __MotifScanner: def __init__(self, motif_file=None, motifs=None, num_workers=4): diff --git a/srrTomat0/motifs/transfac.py b/srrTomat0/motifs/transfac.py index 8cf4737..9afbc86 100644 --- a/srrTomat0/motifs/transfac.py +++ b/srrTomat0/motifs/transfac.py @@ -80,9 +80,10 @@ def __parse_motif_gen(handle): # Prob elif line_id.isdigit(): - probs = list(map(float, line.split()[:-1])) - total_seqs = sum(probs) - active_motif.add_prob_line(list(map(lambda x: x / total_seqs, probs))) + counts = list(map(float, line.split()[:-1])) + active_motif.add_count_line(counts) 
+ total_seqs = sum(counts) + active_motif.add_prob_line(list(map(lambda x: x / total_seqs, counts))) if len(active_motif) > 0: yield active_motif From ffd8233c151d963ff308c3b5bf2a7c6ea957d5cb Mon Sep 17 00:00:00 2001 From: asistradition Date: Mon, 14 Sep 2020 12:41:18 -0400 Subject: [PATCH 07/20] Fix overrunning chromosome --- srrTomat0/processor/gtf.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/srrTomat0/processor/gtf.py b/srrTomat0/processor/gtf.py index 8cacd82..7eff48e 100644 --- a/srrTomat0/processor/gtf.py +++ b/srrTomat0/processor/gtf.py @@ -39,12 +39,13 @@ def load_gtf_to_dataframe(gtf_path): return _add_TSS(annotations) -def open_window(annotation_dataframe, window_size, use_tss=False): +def open_window(annotation_dataframe, window_size, use_tss=False, check_against_fasta=None): """ This needs to adjust the start and stop in the annotation dataframe with window sizes :param annotation_dataframe: pd.DataFrame :param window_size: int :param use_tss: bool + :param check_against_fasta: :return window_annotate: pd.DataFrame """ window_annotate = annotation_dataframe.copy() @@ -72,6 +73,23 @@ def open_window(annotation_dataframe, window_size, use_tss=False): window_annotate.loc[window_annotate[SEQ_START] < 0, SEQ_START] = 0 + if check_against_fasta is not None: + fasta_len = {} + with open(check_against_fasta, mode="r") as fasta_fh: + current_record = None + for line in fasta_fh: + if line.startswith(">"): + current_record = line[1:].split()[0] + fasta_len[current_record] = 0 + else: + fasta_len[current_record] += len(line.strip()) + + for chromosome in window_annotate[GTF_CHROMOSOME].unique(): + _chrlen = fasta_len[chromosome] + _idx = window_annotate[GTF_CHROMOSOME] == chromosome + window_annotate.loc[_idx & (window_annotate[SEQ_STOP] > _chrlen), SEQ_STOP] = _chrlen + window_annotate.loc[_idx & (window_annotate[SEQ_START] > _chrlen), SEQ_START] = _chrlen + return window_annotate From 
3b934f7097be15ff0d08d652805eb34ad5fa3817 Mon Sep 17 00:00:00 2001 From: asistradition Date: Mon, 14 Sep 2020 12:41:32 -0400 Subject: [PATCH 08/20] Gaussian outlier detection --- srrTomat0/processor/prior.py | 78 ++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/srrTomat0/processor/prior.py b/srrTomat0/processor/prior.py index 19c1702..95f2add 100644 --- a/srrTomat0/processor/prior.py +++ b/srrTomat0/processor/prior.py @@ -7,6 +7,7 @@ import numpy as np import pathos.multiprocessing as multiprocessing from sklearn.cluster import DBSCAN +from sklearn.covariance import EllipticEnvelope PRIOR_TF = 'regulator' PRIOR_GENE = 'target' @@ -67,18 +68,12 @@ def score_tf(cls, tf_motifs): tf_motifs = tf_motifs.sort_values(by=MotifScan.start_col) overlap = tf_motifs[MotifScan.start_col] < tf_motifs[MotifScan.stop_col].shift() - # Collapse together any overlapping motifs to the maximum score + # Collapse together any overlapping motifs to the maximum score on a per-base basis if overlap.any(): - if (tf_motifs[MOTIF_NAME_COL] == "GAL4").any(): - print(tf_motifs.sort_values(by=MotifScan.start_col)) - tf_motifs["GROUP"] = (~overlap).cumsum() tf_motifs = pd.concat([cls._agg_per_base(group) for _, group in tf_motifs.groupby("GROUP")]) - if (tf_motifs[MOTIF_NAME_COL] == "GAL4").any(): - print(tf_motifs.sort_values(by=MotifScan.start_col)) - n_sites = tf_motifs.shape[0] # If there's only one site check it and then return @@ -230,22 +225,21 @@ def _prior_mapper(data): np.random.seed(seed) - thresholded_data = [] - # Threshold using DBSCAN outlier detection - for reg in prior_data[PRIOR_TF].unique(): - reg_edge = prior_data.loc[prior_data[PRIOR_TF] == reg, :] - reg_edge = reg_edge.loc[_find_outliers_dbscan(reg_edge), :] - thresholded_data.append(reg_edge.copy()) - - # Pivot to a matrix, extend to all TFs, and fill with 1s + # Pivot to a matrix, extend to all TFs, and fill with 0s raw_matrix = prior_data.pivot(index=PRIOR_GENE, 
columns=PRIOR_TF, values=PRIOR_SCORE) raw_matrix = raw_matrix.reindex(motif_names, axis=1).reindex(genes[GTF_GENENAME], axis=0).fillna(0) + raw_matrix.index.name = PRIOR_GENE - thresholded_data = pd.concat(thresholded_data).reset_index(drop=True) + prior_matrix = raw_matrix.copy() + # Threshold using DBSCAN outlier detection + for reg in prior_matrix.columns: + prior_matrix.loc[~_find_outliers_elliptic_envelope(prior_matrix[reg]), reg] = 0. - # Pivot to a matrix, extend to all TFs, and fill with 1s - prior_matrix = thresholded_data.pivot(index=PRIOR_GENE, columns=PRIOR_TF, values=PRIOR_SCORE) - prior_matrix = prior_matrix.reindex(motif_names, axis=1).reindex(genes[GTF_GENENAME], axis=0).fillna(0) + # Keep the peaks that we want + thresholded_data = prior_matrix.reset_index().melt(id_vars=PRIOR_GENE, var_name=PRIOR_TF, value_name='T') + thresholded_data = prior_data.merge(thresholded_data, on=[PRIOR_GENE, PRIOR_TF]) + thresholded_data = thresholded_data.loc[thresholded_data['T'] != 0, :] + thresholded_data.drop('T', axis=1, inplace=True) return thresholded_data, prior_matrix, raw_matrix @@ -265,13 +259,13 @@ def _gene_gen(genes, motif_peaks): def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): - scores = tf_data[PRIOR_SCORE].values.reshape(-1, 1) + scores = tf_data.values.reshape(-1, 1) counts = tf_data.shape[0] labels = DBSCAN(min_samples=np.log2(counts), eps=scores.max() / 100).fit_predict(scores) # Keep any outliers (outliers near 0 should be discarded) - keep_edge = pd.Series((labels == -1) & (tf_data[PRIOR_SCORE].values > np.mean(scores)), index=tf_data.index) + keep_edge = pd.Series((labels == -1) & (tf_data.values > np.mean(scores)), index=tf_data.index) # Iterate through clusters in reverse order until at least t_1 and no more than t_2 edges are included for lab in np.unique(labels)[::-1]: @@ -287,22 +281,44 @@ def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): return keep_edge +def _find_outliers_elliptic_envelope(tf_data, target=0.01): + + scores 
= tf_data.values + keep_genes = pd.Series(False, index=tf_data.index) + + if np.var(scores) == 0.: + return keep_genes + + try: + labels = EllipticEnvelope(contamination=target, support_fraction=1).fit_predict(scores.reshape(-1,1)) + except ValueError as _err: + return keep_genes + + keep_genes |= ((labels == -1) & (scores > np.mean(scores))) + return keep_genes + + def _build_prior_for_gene(gene_info, motif_data, motif_information, num_iteration): """ - Takes ATAC peaks and Motif locations near a single gene and turns them into TF-gene scores - - :param gene_data: (str, pd.DataFrame, int, pd.DataFrame) - Unpacks to gene_name, motif_data, num_iteration, motif_data - gene_name: str identifier for the gene - chromatin_data: pd.DataFrame which has the ATAC (open chromatin) peaks near the gene - motif_data: pd.DataFrame which has the Motif locations near the gene - num_iteration: int the number of genes which have been processed + Takes motifs identified by scan near a single gene and turns them into TF-gene scores + + :param gene_info: Gene information from annotations + :type gene_info: pd.DataFrame + :param motif_data: Motif locations near the gene + :type motif_data: pd.DataFrame + :param motif_information: Motif information + :type motif_information: pd.DataFrame + :param num_iteration: Number of genes which have been processed + :type num_iteration: int :return prior_edges: pd.DataFrame [N x 5] 'regulator': tf name 'target': gene name 'count': number of motifs found - 'score': negative log10 of p-value - 'pvalue': p-value calculated using poisson survival function + 'score': information content-based score of binding site + 'motif_ic': information content score of motif + 'start': binding site start + 'stop': binding site stop + 'chromosome' binding site chromosome """ gene_name = gene_info[GTF_GENENAME] From 4b6a080e7484afee7691ee3ddf1f0ec71f2ee13e Mon Sep 17 00:00:00 2001 From: asistradition Date: Mon, 14 Sep 2020 15:48:45 -0400 Subject: [PATCH 09/20] 0.025 --- 
srrTomat0/processor/prior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/srrTomat0/processor/prior.py b/srrTomat0/processor/prior.py index 95f2add..6d5e84d 100644 --- a/srrTomat0/processor/prior.py +++ b/srrTomat0/processor/prior.py @@ -281,7 +281,7 @@ def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): return keep_edge -def _find_outliers_elliptic_envelope(tf_data, target=0.01): +def _find_outliers_elliptic_envelope(tf_data, target=0.025): scores = tf_data.values keep_genes = pd.Series(False, index=tf_data.index) From 70d998ea0eb333d823c09508f53edd3c19cd19e8 Mon Sep 17 00:00:00 2001 From: asistradition Date: Mon, 14 Sep 2020 16:36:03 -0400 Subject: [PATCH 10/20] Short circuit --- srrTomat0/processor/prior.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/srrTomat0/processor/prior.py b/srrTomat0/processor/prior.py index 6d5e84d..332e0ee 100644 --- a/srrTomat0/processor/prior.py +++ b/srrTomat0/processor/prior.py @@ -163,11 +163,11 @@ def _agg_per_base(cls, overlap_df): if len(overlap_df) == 1: return overlap_df[[MotifScan.start_col, MotifScan.stop_col, SCAN_SCORE_COL, MOTIF_NAME_COL]] - new_df = [] - for i in overlap_df.index: - new_df.extend([(a, b) for a, b in zip(range(overlap_df.loc[i, MotifScan.start_col], - overlap_df.loc[i, MotifScan.stop_col]), - overlap_df.loc[i, SCORE_PER_BASE])]) + overlap_df.reset_index(inplace=True) + + new_df = [(a, b) for i in overlap_df.index for a, b in zip(range(overlap_df.loc[i, MotifScan.start_col], + overlap_df.loc[i, MotifScan.stop_col]), + overlap_df.loc[i, SCORE_PER_BASE])] return pd.DataFrame({MotifScan.start_col: [overlap_df[MotifScan.start_col].min()], MotifScan.stop_col: [overlap_df[MotifScan.stop_col].max()], @@ -281,7 +281,7 @@ def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): return keep_edge -def _find_outliers_elliptic_envelope(tf_data, target=0.025): +def _find_outliers_elliptic_envelope(tf_data, target=0.02): scores = tf_data.values keep_genes = 
pd.Series(False, index=tf_data.index) @@ -289,6 +289,9 @@ def _find_outliers_elliptic_envelope(tf_data, target=0.025): if np.var(scores) == 0.: return keep_genes + if np.sum(scores > 0) / scores.size * 2 < target: + return keep_genes | (scores > 0) + try: labels = EllipticEnvelope(contamination=target, support_fraction=1).fit_predict(scores.reshape(-1,1)) except ValueError as _err: From 8a3ec27acdc88d22bcb6646304e8fbfae4827f01 Mon Sep 17 00:00:00 2001 From: asistradition Date: Tue, 15 Sep 2020 18:54:00 -0400 Subject: [PATCH 11/20] mahalanobis distance based elliptic --- srrTomat0/processor/prior.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/srrTomat0/processor/prior.py b/srrTomat0/processor/prior.py index 332e0ee..6cc8898 100644 --- a/srrTomat0/processor/prior.py +++ b/srrTomat0/processor/prior.py @@ -8,6 +8,7 @@ import pathos.multiprocessing as multiprocessing from sklearn.cluster import DBSCAN from sklearn.covariance import EllipticEnvelope +from scipy.ndimage.filters import uniform_filter1d PRIOR_TF = 'regulator' PRIOR_GENE = 'target' @@ -281,7 +282,7 @@ def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): return keep_edge -def _find_outliers_elliptic_envelope(tf_data, target=0.02): +def _find_outliers_elliptic_envelope(tf_data, outlier=2.5, skip_threshold=0.002): scores = tf_data.values keep_genes = pd.Series(False, index=tf_data.index) @@ -289,15 +290,26 @@ def _find_outliers_elliptic_envelope(tf_data, target=0.02): if np.var(scores) == 0.: return keep_genes - if np.sum(scores > 0) / scores.size * 2 < target: - return keep_genes | (scores > 0) + _nz_idx = scores > 0 - try: - labels = EllipticEnvelope(contamination=target, support_fraction=1).fit_predict(scores.reshape(-1,1)) - except ValueError as _err: + if skip_threshold is not None and np.sum(_nz_idx) < (len(keep_genes) * skip_threshold): + return keep_genes | _nz_idx + + # Calculate Mahalanobis distance + _nzs = scores[_nz_idx].reshape(-1, 1) + + if 
np.var(_nzs) == 0.: return keep_genes - keep_genes |= ((labels == -1) & (scores > np.mean(scores))) + # Correct for just the dumbest bug in scipy + if (np.mean(_nzs) == np.median(_nzs)) & (np.sum(_nzs == np.median(_nzs)) > (0.5 * _nzs.size)): + keep_genes[_nz_idx] = (_nzs >= np.median(_nzs)).flatten() + return keep_genes + + m_dist = EllipticEnvelope(support_fraction=1).fit(_nzs).score_samples(_nzs) + scaled_m_dist = (m_dist - np.mean(m_dist)) / np.std(m_dist) + keep_genes[_nz_idx] = scaled_m_dist < (-1 * outlier) + return keep_genes From 6cee80f9d817f5c0a887094508c3f6e327520a22 Mon Sep 17 00:00:00 2001 From: asistradition Date: Wed, 23 Sep 2020 12:10:38 -0400 Subject: [PATCH 12/20] Force chromosomes to strings and add warning about mismatching names --- srrTomat0/processor/gtf.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/srrTomat0/processor/gtf.py b/srrTomat0/processor/gtf.py index 7eff48e..81c3657 100644 --- a/srrTomat0/processor/gtf.py +++ b/srrTomat0/processor/gtf.py @@ -1,3 +1,4 @@ +import warnings import pybedtools import pandas as pd @@ -36,6 +37,10 @@ def load_gtf_to_dataframe(gtf_path): # Define genes as going from the minimum start for any subfeature to the maximum end for any subfeature annotations = _fix_genes(annotations) + + # Fix chromosome names to always be strings + annotations[GTF_CHROMOSOME] = annotations[GTF_CHROMOSOME].astype(str) + return _add_TSS(annotations) @@ -84,7 +89,22 @@ def open_window(annotation_dataframe, window_size, use_tss=False, check_against_ else: fasta_len[current_record] += len(line.strip()) - for chromosome in window_annotate[GTF_CHROMOSOME].unique(): + _gtf_chromosomes = set(window_annotate[GTF_CHROMOSOME].unique()) + _fasta_chromsomes = set(fasta_len.keys()) + _gtf_fasta_match = _gtf_chromosomes.intersection(_fasta_chromsomes) + + if len(_gtf_fasta_match) != len(_gtf_chromosomes): + _msg = "GTF File Chromosomes {g} do not match FASTA File Chromosomes {f}\n" + _msg += "The 
following chromosomes will not map correctly: {ft}" + _msg = _msg.format(g=_gtf_chromosomes, + f=_fasta_chromsomes, + ft=_gtf_chromosomes.symmetric_difference(_fasta_chromsomes)) + warnings.warn(_msg) + + if len(_gtf_fasta_match) == 0: + raise ValueError("Unable to map FASTA and GTF chromosomes together") + + for chromosome in _gtf_fasta_match: _chrlen = fasta_len[chromosome] _idx = window_annotate[GTF_CHROMOSOME] == chromosome window_annotate.loc[_idx & (window_annotate[SEQ_STOP] > _chrlen), SEQ_STOP] = _chrlen From c4ea6ad22855b87de9588429f3ecbf3f95063f25 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sat, 26 Sep 2020 14:04:10 -0400 Subject: [PATCH 13/20] Rename to inferelator-prior --- inferelator_prior/ATACTomat0.py | 87 + inferelator_prior/ChIPBedTomat0.py | 145 ++ inferelator_prior/RNASeqTomat0.py | 134 ++ inferelator_prior/__init__.py | 25 + inferelator_prior/mkref.py | 32 + inferelator_prior/motifs/__init__.py | 4 + inferelator_prior/motifs/_motif.py | 385 +++++ inferelator_prior/motifs/fimo.py | 68 + inferelator_prior/motifs/homer.py | 70 + inferelator_prior/motifs/homer_motif.py | 71 + inferelator_prior/motifs/meme.py | 179 +++ inferelator_prior/motifs/motif_scan.py | 49 + inferelator_prior/motifs/pwm.py | 46 + inferelator_prior/motifs/transfac.py | 90 ++ inferelator_prior/network_from_motifs.py | 140 ++ inferelator_prior/processor/__init__.py | 0 inferelator_prior/processor/bedtools.py | 123 ++ inferelator_prior/processor/chroma.py | 13 + inferelator_prior/processor/gtf.py | 144 ++ inferelator_prior/processor/htseq_count.py | 88 + inferelator_prior/processor/matrix.py | 176 ++ inferelator_prior/processor/prior.py | 382 +++++ inferelator_prior/processor/samtools.py | 83 + inferelator_prior/processor/srr.py | 173 ++ inferelator_prior/processor/star.py | 218 +++ inferelator_prior/processor/utils.py | 142 ++ inferelator_prior/pwm_to_meme.py | 34 + inferelator_prior/tests/__init__.py | 0 .../tests/artifacts/M00799_2.00.txt | 10 + 
.../tests/artifacts/sc64_mito.fasta | 1431 +++++++++++++++++ .../tests/artifacts/sc64_mito.gtf | 317 ++++ inferelator_prior/tests/artifacts/test.meme | 23 + inferelator_prior/tests/artifacts/test.motif | 10 + .../tests/artifacts/test_alignment.fastq.gz | Bin 0 -> 9201 bytes .../tests/artifacts/test_ecori.meme | 26 + .../tests/artifacts/test_motif_search.bed | 2 + .../tests/artifacts/test_motif_search.fasta | 10 + .../artifacts/test_motif_search.fasta.fai | 1 + inferelator_prior/tests/test_motif.py | 114 ++ inferelator_prior/tests/test_motif_scan.py | 88 + inferelator_prior/tests/test_srr.py | 50 + inferelator_prior/tests/test_star.py | 72 + inferelator_prior/tests/test_utils.py | 55 + inferelator_prior/velocity/__init__.py | 0 inferelator_prior/velocity/calc.py | 79 + 45 files changed, 5389 insertions(+) create mode 100644 inferelator_prior/ATACTomat0.py create mode 100755 inferelator_prior/ChIPBedTomat0.py create mode 100644 inferelator_prior/RNASeqTomat0.py create mode 100644 inferelator_prior/__init__.py create mode 100644 inferelator_prior/mkref.py create mode 100644 inferelator_prior/motifs/__init__.py create mode 100644 inferelator_prior/motifs/_motif.py create mode 100644 inferelator_prior/motifs/fimo.py create mode 100644 inferelator_prior/motifs/homer.py create mode 100644 inferelator_prior/motifs/homer_motif.py create mode 100644 inferelator_prior/motifs/meme.py create mode 100644 inferelator_prior/motifs/motif_scan.py create mode 100644 inferelator_prior/motifs/pwm.py create mode 100644 inferelator_prior/motifs/transfac.py create mode 100644 inferelator_prior/network_from_motifs.py create mode 100644 inferelator_prior/processor/__init__.py create mode 100644 inferelator_prior/processor/bedtools.py create mode 100644 inferelator_prior/processor/chroma.py create mode 100644 inferelator_prior/processor/gtf.py create mode 100644 inferelator_prior/processor/htseq_count.py create mode 100644 inferelator_prior/processor/matrix.py create mode 100644 
inferelator_prior/processor/prior.py create mode 100644 inferelator_prior/processor/samtools.py create mode 100644 inferelator_prior/processor/srr.py create mode 100644 inferelator_prior/processor/star.py create mode 100644 inferelator_prior/processor/utils.py create mode 100644 inferelator_prior/pwm_to_meme.py create mode 100644 inferelator_prior/tests/__init__.py create mode 100644 inferelator_prior/tests/artifacts/M00799_2.00.txt create mode 100644 inferelator_prior/tests/artifacts/sc64_mito.fasta create mode 100644 inferelator_prior/tests/artifacts/sc64_mito.gtf create mode 100644 inferelator_prior/tests/artifacts/test.meme create mode 100644 inferelator_prior/tests/artifacts/test.motif create mode 100644 inferelator_prior/tests/artifacts/test_alignment.fastq.gz create mode 100644 inferelator_prior/tests/artifacts/test_ecori.meme create mode 100644 inferelator_prior/tests/artifacts/test_motif_search.bed create mode 100644 inferelator_prior/tests/artifacts/test_motif_search.fasta create mode 100644 inferelator_prior/tests/artifacts/test_motif_search.fasta.fai create mode 100644 inferelator_prior/tests/test_motif.py create mode 100644 inferelator_prior/tests/test_motif_scan.py create mode 100644 inferelator_prior/tests/test_srr.py create mode 100644 inferelator_prior/tests/test_star.py create mode 100644 inferelator_prior/tests/test_utils.py create mode 100644 inferelator_prior/velocity/__init__.py create mode 100644 inferelator_prior/velocity/calc.py diff --git a/inferelator_prior/ATACTomat0.py b/inferelator_prior/ATACTomat0.py new file mode 100644 index 0000000..8ec53d5 --- /dev/null +++ b/inferelator_prior/ATACTomat0.py @@ -0,0 +1,87 @@ +import argparse +import os + +import pandas as pd + +from inferelator_prior.processor.utils import file_path_abs +from inferelator_prior.processor.srr import get_srr_files, unpack_srr_files +from inferelator_prior.processor.star import star_align_fastqs +from inferelator_prior.processor.samtools import sam_sort + +from 
inferelator_prior import SRR_SUBPATH, FASTQ_SUBPATH, STAR_ALIGNMENT_SUBPATH, BAM_SUBPATH + + +OUTPUT_MATRIX_FILE_NAME = "atac_matrix.tsv" + +COUNT_FILE_METAINDEXES = ["N_unmapped", "N_multimapping", "N_noFeature", "N_ambiguous"] +COUNT_FILE_HEADER = ["Total", "MinusStrand", "PlusStrand"] +COUNT_FILE_HEADER_FOR_OUTPUT = "Total" + + +def main(): + ap = argparse.ArgumentParser(description="Turn ATAC-seq expression SRRs from NCBI GEO into a prior matrix") + ap.add_argument("-s", "--srr", dest="srr", help="SRR record IDs", nargs="+", metavar="SRRID", default=None) + ap.add_argument("-f", "--file", dest="file", help="List of SRR records in a TXT file", metavar="FILE", default=None) + ap.add_argument("-g", "--genome", dest="genome", help="STAR reference genome", metavar="PATH", required=True) + ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) + ap.add_argument("--gzip", dest="gzip", help="GZIP output file", action='store_const', const=True, default=False) + + args = ap.parse_args() + srr_ids = list() + + if args.srr is None and args.file is None: + print("One of --srr or --file must be set") + exit(1) + elif args.srr is not None and args.file is not None: + print("Only one of --srr or --file may be set (not both)") + exit(1) + elif args.srr is not None: + # SRR IDs are provided at command line + srr_ids = args.srr + elif args.file is not None: + # SRR IDs are in a .txt file; read them into a list + srr_ids = pd.read_csv(args.file, sep="\t", index_col=None, header=None).iloc[:, 0].tolist() + else: + raise ValueError("There is something wrong with this switch") + + atac_tomat0(srr_ids, args.out, args.genome, gzip_output=args.gzip) + + +def atac_tomat0(srr_ids, output_path, star_reference_genome, gzip_output=False, cores=4, star_jobs=2, star_args=None, + min_quality=None): + + star_args = [] if star_args is None else star_args + + output_path = file_path_abs(output_path) + os.makedirs(output_path, exist_ok=True) + + # Download all 
the SRR files + print("Downloading SRR files") + os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True) + srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores) + + # Unpack all the SRR files into FASTQ files + print("Unpacking SRR files") + os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True) + fastq_file_names = unpack_srr_files(srr_ids, srr_file_names, os.path.join(output_path, FASTQ_SUBPATH), + num_workers=cores) + + # Run all the FASTQ files through STAR to align + print("Aligning FASTQ files") + os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True) + thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs)) + sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome, + os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), + num_workers=star_jobs, threads_per_worker=thread_count, star_options=star_args) + + # Sort all the SAM files into BAM files + print("Sorting SAM files into BAM files") + os.makedirs(os.path.join(output_path, BAM_SUBPATH), exist_ok=True) + bam_file_names = sam_sort(srr_ids, sam_file_names, os.path.join(output_path, BAM_SUBPATH), min_quality=min_quality, + num_workers=cores) + + +if __name__ == '__main__': + main() + + diff --git a/inferelator_prior/ChIPBedTomat0.py b/inferelator_prior/ChIPBedTomat0.py new file mode 100755 index 0000000..0e900d3 --- /dev/null +++ b/inferelator_prior/ChIPBedTomat0.py @@ -0,0 +1,145 @@ +import argparse +import pandas as pd + +import pybedtools + +from inferelator_prior.processor.gtf import load_gtf_to_dataframe, SEQ_START, SEQ_STOP, GTF_GENENAME, SEQ_TSS +from inferelator_prior.processor.utils import file_path_abs +from inferelator_prior.processor.bedtools import get_peaks_in_features + +# Column names +BED_CHROMOSOME = 'chrom' +SEQ_COUNTS = 'count' +SEQ_BIN = 'bin' + +# Quantiles for bin +PEAK_QUANTILES = [0.25, 0.5, 0.75, 1] + + +def main(): + ap = 
argparse.ArgumentParser(description="Load peaks and genes.") + ap.add_argument("-f", "--file", dest="file", help="TSV file with ID|BED PATH pairs", metavar="FILE", default=None) + ap.add_argument("-b", "--bed", dest="bed", help="BED file containing ChIP peaks", nargs="+", metavar="FILE", + default=None) + ap.add_argument("-a", "--annotation", dest="anno", help="GTF/GFF Annotation File", metavar="FILE", required=True) + ap.add_argument("-o", "--out", dest="out", help="Output TSV PATH", metavar="PATH", required=True) + ap.add_argument("-w", "--window", dest="window_size", help="Window size", type=int, default=0) + ap.add_argument("-g", "--genebody", dest="gene", help="Gene body", action="store_const", const=True, default=False) + ap.add_argument("-t", "--tss", dest="tss", help="Transcription start site", action="store_const", const=True, + default=False) + args = ap.parse_args() + + if args.bed is None and args.file is None: + print("One of --bed or --file must be set") + exit(1) + elif args.bed is not None and args.file is not None: + print("Only one of --bed or --file may be set (not both)") + exit(1) + elif args.bed is not None: + # SRR IDs are provided at command line + id_names = args.bed + chip_bed_files = args.bed + elif args.file is not None: + # SRR IDs are in a .txt file; read them into a list + chip_samples = pd.read_csv(args.file, sep="\t", index_col=None, header=None) + if chip_samples.shape[1] != 2: + print("The TSV file must have two columns: ID and File_Path") + id_names = chip_samples.iloc[:, 0].tolist() + chip_bed_files = chip_samples.iloc[:, 1].tolist() + else: + raise ValueError("There is something wrong with this switch") + + if args.gene is False and args.tss is False: + print("One of --genebody or --tss must be set") + exit(1) + elif args.gene is not False and args.tss is not False: + print("Only one of --genebody or --tss may be set (not both)") + exit(1) + + chip_bed_tomat0(id_names, chip_bed_files, args.anno, output_path=args.out, 
window_size=args.window_size, + gene_body_flag=args.gene, tss_flag=args.tss) + + +def chip_bed_tomat0(id_names, chip_peaks_file, annotation_file, output_path=None, window_size=0, gene_body_flag = False, + tss_flag = False): + """ + Process a BED file of peaks into a integer peak-count matrix + :param chip_peaks_file: list(str) + List of paths to a BED file + :param output_path: str + Path to the output TSV file + :param annotation_file: str + Path to the GTF annotation file + :param window_size: int + Window on each side of a gene to include a peak in the count + 100 means 100bp up from start and 100bp down from end + :return gene_counts: pd.DataFrame + Integer count matrix of peaks per gene + """ + + # Convert paths to absolutes + output_path = file_path_abs(output_path) + annotation_file = file_path_abs(annotation_file) + + # Load annotations into a dataframe with pybedtools + # Adjust the start and stop positions to account for a flanking window + genes = load_gtf_to_dataframe(annotation_file) + + if gene_body_flag: + genes = open_window(genes, window_size) + if tss_flag: + genes = open_tss(genes, window_size) + + + prior_data = pd.DataFrame(index=genes[GTF_GENENAME]) + for id_name, peak_file in zip(id_names, chip_peaks_file): + # Load BED file into a dataframe with pybedtools + peak_file = file_path_abs(peak_file) + chip_peaks = pybedtools.BedTool(peak_file).to_dataframe() + gene_counts = get_peaks_in_features(genes, chip_peaks) + + # Get non-zero quantiles and use them to bin peak overlap by length + quantiles = gene_counts.loc[gene_counts[SEQ_COUNTS] != 0, SEQ_COUNTS].quantile(PEAK_QUANTILES) + gene_counts[SEQ_BIN] = 0 + + for i, qval in enumerate(quantiles.sort_values(ascending=True)): + gene_counts.loc[gene_counts[SEQ_COUNTS] >= qval, SEQ_BIN] = i + 1 + + # Rename the column with ID and reindex for join + gene_counts = gene_counts.rename({SEQ_BIN: id_name}).set_index(GTF_GENENAME).drop([SEQ_COUNTS], axis=1) + prior_data = prior_data.join(gene_counts, 
on=[GTF_GENENAME]) + + if output_path is not None: + prior_data.to_csv(output_path, sep="\t") + + return prior_data + + +def open_window(annotation_dataframe, window_size): + """ + This needs to adjust the start and stop in the annotation dataframe with window sizes + :param annotation_dataframe: pd.DataFrame + :param window_size: int + :return windowed_dataframe: pd.DataFrame + """ + windowed_dataframe = annotation_dataframe.copy() + windowed_dataframe[SEQ_START] = windowed_dataframe[SEQ_START] - window_size + windowed_dataframe[SEQ_STOP] = windowed_dataframe[SEQ_STOP] + window_size + windowed_dataframe.loc[windowed_dataframe[SEQ_START] < 0, SEQ_START] = 0 + return windowed_dataframe + +def open_tss(annotation_file, window_size): + """ + This needs to adjust the start and stop in the annotation dataframe with window sizes + :param annotation_dataframe: pd.DataFrame + :param window_size: int + :return windowed_dataframe: pd.DataFrame + """ + tss_dataframe = annotation_file.copy() + tss_dataframe[SEQ_START] = tss_dataframe[SEQ_TSS] - window_size + tss_dataframe[SEQ_STOP] = tss_dataframe[SEQ_TSS] + window_size + tss_dataframe.loc[tss_dataframe[SEQ_START] < 0, SEQ_START] = 0 + return tss_dataframe + +if __name__ == '__main__': + main() diff --git a/inferelator_prior/RNASeqTomat0.py b/inferelator_prior/RNASeqTomat0.py new file mode 100644 index 0000000..0f3a7fd --- /dev/null +++ b/inferelator_prior/RNASeqTomat0.py @@ -0,0 +1,134 @@ +from __future__ import print_function + +import os + +import pandas as pd + +from inferelator_prior import SRR_SUBPATH, FASTQ_SUBPATH, STAR_ALIGNMENT_SUBPATH, HTSEQ_ALIGNMENT_SUBPATH +from inferelator_prior.processor.htseq_count import htseq_count_aligned +from inferelator_prior.processor.matrix import pileup_raw_counts, normalize_matrix_to_fpkm, normalize_matrix_to_tpm +from inferelator_prior.processor.srr import get_srr_files, unpack_srr_files +from inferelator_prior.processor.star import star_align_fastqs +from 
inferelator_prior.processor.utils import file_path_abs, test_requirements_exist, ArgParseTestRequirements + +OUTPUT_COUNT_FILE_NAME = "srr_counts.tsv" +OUTPUT_COUNT_METADATA_NAME = "srr_alignment_metadata.tsv" +OUTPUT_FPKM_FILE_NAME = "srr_fpkm.tsv" +OUTPUT_TPM_FILE_NAME = "srr_tpm.tsv" + + +def main(): + ap = ArgParseTestRequirements(description="Turn a list of RNAseq expression SRRs from NCBI GEO into a count matrix") + ap.add_argument("-s", "--srr", dest="srr", help="SRR record IDs", nargs="+", metavar="SRRID", default=None) + ap.add_argument("-f", "--file", dest="file", help="List of SRR records in a TXT file", metavar="FILE", default=None) + ap.add_argument("-g", "--genome", dest="genome", help="STAR reference genome", metavar="PATH", required=True) + ap.add_argument("-a", "--annotation", dest="anno", help="GTF/GFF Annotation File", metavar="FILE", required=True) + ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) + ap.add_argument("--gzip", dest="gzip", help="GZIP output file", action='store_const', const=True, default=False) + ap.add_argument("--cpu", dest="cpu", help="NUM of cores to use", metavar="NUM", type=int, default=4) + ap.add_argument("--star_jobs", dest="sjob", help="NUM of STAR workers to use", metavar="NUM", type=int, default=4) + + args, star_args = ap.parse_known_args() + test_requirements_exist() + + srr_ids = list() + + if args.srr is None and args.file is None: + print("One of --srr or --file must be set") + exit(1) + elif args.srr is not None and args.file is not None: + print("Only one of --srr or --file may be set (not both)") + exit(1) + elif args.srr is not None: + # SRR IDs are provided at command line + srr_ids = args.srr + elif args.file is not None: + # SRR IDs are in a .txt file; read them into a list + srr_ids = pd.read_csv(args.file, sep="\t", index_col=None, header=None).iloc[:, 0].tolist() + else: + raise ValueError("There is something wrong with this switch") + + srr_tomat0(srr_ids, 
args.out, args.genome, args.anno, gzip_output=args.gzip, cores=args.cpu, star_jobs=args.sjob, + star_args=star_args) + + +def srr_tomat0(srr_ids, output_path, star_reference_genome, annotation_file, gzip_output=False, cores=4, star_jobs=2, + star_args=None): + star_args = [] if star_args is None else star_args + + output_path = file_path_abs(output_path) + os.makedirs(output_path, exist_ok=True) + + # Download all the SRR files + print("Downloading SRR files") + os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True) + srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores) + + # Unpack all the SRR files into FASTQ files + print("Unpacking SRR files") + os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True) + fastq_file_names = unpack_srr_files(srr_ids, srr_file_names, os.path.join(output_path, FASTQ_SUBPATH), + num_workers=cores) + + # Run all the FASTQ files through STAR to align + print("Aligning FASTQ files") + os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True) + thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs)) + sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome, + os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), + num_workers=star_jobs, threads_per_worker=thread_count, star_options=star_args) + + # Run all the SAM files through HTSeq.count to count + print("Counting SAM alignments") + os.makedirs(os.path.join(output_path, HTSEQ_ALIGNMENT_SUBPATH), exist_ok=True) + count_file_names = htseq_count_aligned(srr_ids, sam_file_names, annotation_file, + os.path.join(output_path, HTSEQ_ALIGNMENT_SUBPATH), num_workers=cores) + + # Convert the count files into a matrix and save it to a TSV + print("Assembling result matrix") + count_matrix, count_metadata = pileup_raw_counts(srr_ids, count_file_names) + count_matrix_file_name = os.path.join(output_path, OUTPUT_COUNT_FILE_NAME) + + # Save the raw counts file + if 
gzip_output: + count_matrix.to_csv(count_matrix_file_name + ".gz", compression='gzip', sep="\t") + else: + count_matrix.to_csv(count_matrix_file_name, sep="\t") + + # Save the count metadata file + count_metadata.to_csv(os.path.join(output_path, OUTPUT_COUNT_METADATA_NAME), sep="\t") + + # Normalize to FPKM + print("Normalizing result matrix to FPKM") + normalized_count_matrix_fpkm = normalize_matrix_to_fpkm(count_matrix, annotation_file) + fpkm_file_name = os.path.join(output_path, OUTPUT_FPKM_FILE_NAME) + + # Save the normalized counts file + if gzip_output: + normalized_count_matrix_fpkm.to_csv(fpkm_file_name + ".gz", compression='gzip', sep="\t") + else: + normalized_count_matrix_fpkm.to_csv(fpkm_file_name, sep="\t") + + # Normalize to TPM + print("Normalizing result matrix to TPM") + normalized_count_matrix_tpm = normalize_matrix_to_tpm(count_matrix, annotation_file) + tpmx_file_name = os.path.join(output_path, OUTPUT_TPM_FILE_NAME) + + # Save the normalized counts file + if gzip_output: + normalized_count_matrix_tpm.to_csv(tpmx_file_name + ".gz", compression='gzip', sep="\t") + else: + normalized_count_matrix_tpm.to_csv(tpmx_file_name, sep="\t") + + print("Count file {sh} generated from {srlen} SRA files".format(sh=count_matrix.shape, srlen=len(srr_ids))) + failed_counts = list(map(lambda x: x is None, count_file_names)) + + if any(failed_counts): + print("{n} Sequence Records could not be counted:".format(n=sum(failed_counts)), end="") + print("\n\t".join([sid for sid, fail in zip(srr_ids, failed_counts) if fail])) + + return count_matrix + + +if __name__ == '__main__': + main() diff --git a/inferelator_prior/__init__.py b/inferelator_prior/__init__.py new file mode 100644 index 0000000..cf24741 --- /dev/null +++ b/inferelator_prior/__init__.py @@ -0,0 +1,25 @@ +import os + +__version__ = '0.1.0' + +# Paths to the executable files +# Defaults assume that executables are in the shell path + +STAR_EXECUTABLE_PATH = os.path.expanduser("STAR") 
+PREFETCH_EXECUTABLE_PATH = os.path.expanduser("prefetch") +FASTQDUMP_EXECUTABLE_PATH = os.path.expanduser("fastq-dump") +CHROMA_EXECUTABLE_PATH = os.path.expanduser("ChromA") +SAMTOOLS_EXECUTABLE_PATH = os.path.expanduser("samtools") +FIMO_EXECUTABLE_PATH = os.path.expanduser("fimo") +HOMER_EXECUTABLE_PATH = os.path.expanduser("homer2") +BEDTOOLS_EXECUTABLE_PATH = os.path.expanduser("bedtools") + +HTSEQ_MODULE_NAME = "HTSeq.scripts.count" +CHROMA_MODULE_NAME = "ChromA" + +SRR_SUBPATH = "SRR" +FASTQ_SUBPATH = "FASTQ" +STAR_ALIGNMENT_SUBPATH = "STAR" +HTSEQ_ALIGNMENT_SUBPATH = "HTSEQ" +BAM_SUBPATH = "BAM" +FIMO_SUBPATH = "FIMO" diff --git a/inferelator_prior/mkref.py b/inferelator_prior/mkref.py new file mode 100644 index 0000000..bbbe103 --- /dev/null +++ b/inferelator_prior/mkref.py @@ -0,0 +1,32 @@ +import argparse + +from inferelator_prior.processor.star import star_mkref + + +def main(): + ap = argparse.ArgumentParser(description="Create a reference genome. All other arguments will be passed to STAR.") + ap.add_argument("-f", "--fasta", dest="fasta", help="FASTA FILE(s)", nargs="+", metavar="FILE", default=None) + ap.add_argument("-a", "--annotation", dest="annotation", help="Annotation GTF/GFF FILE", metavar="FILE", + default=None) + ap.add_argument("-g", "--genome", dest="genome", help="Create standard ref genome", metavar="PATH", default=None) + ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) + ap.add_argument("--cpu", dest="cpu", help="NUMBER of cores to use", metavar="PATH", type=int, default=4) + + args, star_args = ap.parse_known_args() + + if (args.fasta is None or args.annotation is None) and args.genome is None: + print("One of (--fasta and --annotation) or --genome must be set. Not neither.") + exit(0) + elif (args.fasta is not None or args.annotation is not None) and args.genome is not None: + print("One of (--fasta and --annotation) or --genome must be set. 
class Motif:
    """
    A sequence motif stored as a position probability matrix (one row per position, one column per letter).

    Rows are added with add_prob_line() or the whole matrix is assigned to probability_matrix.
    Derived values (information content matrix, consensus sequence) are cached and are recomputed
    whenever the matrix changes.
    """

    motif_id = None
    motif_name = None
    motif_url = None

    _motif_probs = None
    _motif_counts = None
    _motif_prob_array = None
    _motif_alphabet = None
    _motif_background = None
    _motif_species = None
    _motif_accession = None
    _alphabet_map = None
    _consensus_seq = None
    _info_matrix = None
    _homer_odds = None

    @property
    def alphabet(self):
        return self._motif_alphabet

    @alphabet.setter
    def alphabet(self, new_alphabet):
        # Assigning None is ignored so a partially-parsed record never wipes an existing alphabet
        if new_alphabet is not None:
            self._motif_alphabet = new_alphabet
            self._alphabet_map = {ch.lower(): i for i, ch in enumerate(self._motif_alphabet)}

    @property
    def accession(self):
        return self._motif_accession

    @accession.setter
    def accession(self, new_accession):
        if new_accession is not None:
            self._motif_accession = new_accession

    @property
    def id(self):
        return self.motif_id

    @id.setter
    def id(self, new_id):
        if new_id is not None:
            self.motif_id = new_id

    @property
    def name(self):
        return self.motif_name

    @name.setter
    def name(self, new_name):
        if new_name is not None:
            self.motif_name = new_name

    @property
    def alphabet_len(self):
        return len(self._motif_alphabet)

    @property
    def background(self):
        # Default to a uniform background over the alphabet, shaped (1, alphabet_len)
        if self._motif_background is None:
            self._motif_background = np.array([[1 / self.alphabet_len] * self.alphabet_len])
        return self._motif_background

    @property
    def probability_matrix(self):
        """
        The probability matrix as a numpy array, or None if no rows have been added.
        """
        if self._motif_prob_array is None and len(self._motif_probs) == 0:
            return None
        # Rebuild the array when rows were added since it was last materialized
        if self._motif_prob_array is None or self._motif_prob_array.shape[0] < len(self._motif_probs):
            self._motif_prob_array = np.array(self._motif_probs)
        return self._motif_prob_array

    @probability_matrix.setter
    def probability_matrix(self, matrix):
        self._motif_prob_array = matrix
        self._reset_caches()

    @property
    def count_matrix(self):
        return np.array(self._motif_counts) if self._motif_counts is not None else None

    @property
    def shannon_entropy(self):
        """
        Sum over all cells of -p * log2(p); 0 for a motif without a matrix.
        """
        if self.probability_matrix is None:
            return 0

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)

            # Calculate -1 * p log p and set to 0 where p is already 0
            entropy = np.multiply(self.probability_matrix, np.log2(self.probability_matrix))
            entropy[~np.isfinite(entropy)] = 0
            entropy *= -1

        return np.sum(entropy)

    @property
    def information_content(self):
        """
        Sum of the information content matrix; 0 for a motif without a matrix.
        """
        if self.probability_matrix is None:
            return 0

        return np.sum(self.ic_matrix)

    @property
    def homer_odds(self):
        # An explicitly assigned value wins over the computed threshold
        return self.threshold_ln_odds if self._homer_odds is None else self._homer_odds

    @homer_odds.setter
    def homer_odds(self, val):
        self._homer_odds = val

    @property
    def ic_matrix(self):
        """
        Per-cell p * log2(p / background), with non-finite and negative cells set to 0.
        None for a motif without a matrix.
        """
        if self.probability_matrix is None:
            return None

        if self._info_matrix is None or self._info_matrix.shape != self.probability_matrix.shape:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)

                # Calculate p log (p/background)
                self._info_matrix = np.divide(self.probability_matrix, self.background.reshape(1, -1))
                self._info_matrix = np.multiply(self.probability_matrix, np.log2(self._info_matrix))
                self._info_matrix[~np.isfinite(self._info_matrix)] = 0.
                self._info_matrix = np.maximum(self._info_matrix, 0.)

        return self._info_matrix

    @property
    def expected_occurrence_rate(self):
        return int(2 ** self.information_content)

    @property
    def consensus(self):
        """
        The most probable letter at each position, joined into one string (first letter wins ties).
        """
        if self._consensus_seq is None:
            # argmax per row also copes with a zero-row matrix, which yields an empty consensus
            best_letters = np.argmax(self.probability_matrix, axis=1)
            self._consensus_seq = "".join(self.alphabet[i] for i in best_letters)
        return self._consensus_seq

    @property
    def max_ln_odds(self):
        # NOTE: 0.25 hard-codes a uniform background over a four letter alphabet
        max_ln_odd = np.log(np.amax(self.probability_matrix, axis=1) / 0.25)
        return np.sum(max_ln_odd)

    @property
    def threshold_ln_odds(self):
        # Column 2 of the ascending sort is the second-largest probability for a four letter alphabet
        second_prob = np.sort(self.probability_matrix, axis=1)[:, 2]
        return self.max_ln_odds - max((np.sum(np.log(second_prob[second_prob > 0.25] / 0.25)), 0.1 * self.max_ln_odds))

    @property
    def species(self):
        return self._motif_species

    @species.setter
    def species(self, new_species):
        """
        Assignment accumulates: a list or tuple extends the stored species, anything else is appended.
        """
        # Always copy into a fresh list so the caller's sequence is never mutated by later assignments
        # and a tuple can still be extended afterwards
        additions = list(new_species) if isinstance(new_species, (list, tuple)) else [new_species]

        if self._motif_species is None:
            self._motif_species = additions
        else:
            self._motif_species.extend(additions)

    def __len__(self):
        return self.probability_matrix.shape[0] if self.probability_matrix is not None else 0

    def __str__(self):
        return "{mid} {mname}: Width {el} IC {ic:.2f} bits".format(mid=self.motif_id,
                                                                   mname=self.motif_name,
                                                                   el=len(self),
                                                                   ic=self.information_content)

    def __init__(self, motif_id=None, motif_name=None, motif_alphabet=None, motif_background=None):
        self.id = motif_id
        self.name = motif_name
        self.alphabet = motif_alphabet
        self._motif_background = motif_background
        self._motif_probs = []

    def _reset_caches(self):
        # Drop values derived from the probability matrix so they are recomputed on next access
        self._consensus_seq = None
        self._info_matrix = None

    def add_prob_line(self, line):
        """
        Append one position (a row of per-letter probabilities) to the motif.
        """
        self._motif_probs.append(line)
        self._reset_caches()

    def add_count_line(self, line):
        if self._motif_counts is not None:
            self._motif_counts.append(line)
        else:
            self._motif_counts = [line]

    def score_match(self, match, disallow_homopolymer=True, homopolymer_one_off_len=6, score_zero_as_zero=None):
        """
        Score a sequence of the motif's width against the motif.

        The score is the summed information content of the matched letters minus the summed squared
        difference to the consensus' information content, floored at 0.

        :param match: Sequence to score; must have the same length as the motif
        :type match: str
        :param disallow_homopolymer: Score homopolymers (and near-homopolymers) as 0
        :type disallow_homopolymer: bool
        :param homopolymer_one_off_len: Near-homopolymers are only rejected for sequences longer than this
        :type homopolymer_one_off_len: int
        :param score_zero_as_zero: If set, score 0 when more than this many letters have probability < 0.001
        :type score_zero_as_zero: int, None
        :return: Match score
        :rtype: float
        :raises ValueError: If the sequence length differs from the motif length
        """

        if len(match) != len(self):
            msg = "Sequence length {l} not compatible with motif length {m}".format(l=len(match), m=len(self))
            raise ValueError(msg)

        # Score anything that's a homopolymer to 0 if the flag is set
        if disallow_homopolymer and sum(m == match[0] for m in match) == len(match):
            return 0

        # Score anything that's one base from a homopolymer to 0 if the flag is set
        if disallow_homopolymer and (len(match) > homopolymer_one_off_len and
                                     sum(min((c, 2)) for c in Counter(match).values()) < 4):
            return 0

        # Score anything with excessive nucleotides that have a p ~ 0.0 as 0
        if score_zero_as_zero is not None and sum(p < 0.001 for p in self._prob_match(match)) > score_zero_as_zero:
            return 0

        match_info = self._info_match(match)
        mse_ic = np.sum(np.square(np.subtract(self._info_match(self.consensus), match_info)))
        return max((np.sum(match_info) - mse_ic, 0.))

    def truncate(self, threshold=0.35):
        """
        Trim leading and trailing positions whose largest probability is not above threshold.

        :param threshold: Probability a position's best letter must exceed to anchor an end of the motif
        :type threshold: float
        """
        if self.probability_matrix is None:
            return

        threshold = np.max(self.probability_matrix, axis=1) > threshold
        # Keep everything between the first and the last position that passes the threshold
        keepers = (threshold.cumsum() > 0) & (threshold[::-1].cumsum()[::-1] > 0)
        # The setter also drops the cached consensus / information matrix of the untrimmed motif
        self.probability_matrix = self.probability_matrix[keepers, :]
        self._motif_probs = list(itertools.compress(self._motif_probs, keepers))

    def _prob_match(self, match):
        return [self.probability_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)]

    def _info_match(self, match):
        return [self.ic_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)]

    def species_contains(self, match_str):
        """
        Case-insensitive substring test against every stored species name.
        """
        if self.species is not None:
            match_str = match_str.lower()
            return any(match_str in s.lower() for s in self.species)
        else:
            return False
def chunk_motifs(file_type, motifs, num_workers=4, min_ic=None):
    """
    Break a list of motifs up into chunks and write each chunk to its own temporary file

    Motifs are dealt round-robin: motif i goes to chunk i % number_of_chunks. Every chunk is written
    to a uniquely named file in the system temp directory, so concurrent runs cannot overwrite each
    other's chunks. The caller is responsible for deleting the returned files.

    :param file_type: The meme or homer namespaces with a .write(file_name, motifs) function
    :type file_type: inferelator_prior.motifs parser
    :param motifs: Motif object list
    :type motifs: list(Motif)
    :param num_workers: number of chunks to make; capped at the number of motifs
    :type num_workers: int
    :param min_ic: set an information content minimum on motifs to include if this is not None
    :type min_ic: float
    :return: List of chunked motif files (always at least one file)
    :rtype: list
    """

    motifs = list(motifs)

    if min_ic is not None:
        motifs = [m for m in motifs if m.information_content >= min_ic]

    # Never more chunks than motifs, and never fewer than one: an empty motif list is written as a
    # single empty chunk, exactly as the single-worker case does, instead of dividing by zero
    num_chunks = max(1, min(num_workers, len(motifs)))

    files = []

    for i in range(num_chunks):
        # mkstemp reserves a unique name; the parser reopens the file by name, so release the descriptor
        handle, file_name = tempfile.mkstemp(prefix="chunk" + str(i) + "_", suffix=".mchunk")
        os.close(handle)

        file_type.write(file_name, motifs[i::num_chunks])
        files.append(file_name)

    return files
self.motif_file is not None: + self.motifs = meme.read(self.motif_file) + + return chunk_motifs(meme, self.motifs, num_workers=self.num_workers, min_ic=min_ic) + + def _postprocess(self, motif_peaks): + motif_peaks = motif_peaks.drop_duplicates(subset=[FIMO_MOTIF, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME]) + return motif_peaks + + def _get_motifs(self, fasta_file, motif_file, threshold=None): + if threshold is None: + fimo_command = FIMO_COMMAND + [motif_file, fasta_file] + else: + fimo_command = FIMO_COMMAND + ["--thresh", str(threshold)] + [motif_file, fasta_file] + + proc = subprocess.run(fimo_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + if int(proc.returncode) != 0: + print("fimo motif scan failed for {meme}, {fa} ({cmd})".format(meme=motif_file, + fa=fasta_file, + cmd=" ".join(fimo_command))) + + return self._parse_output(io.StringIO(proc.stdout.decode("utf-8"))) + + def _parse_output(self, output_handle): + try: + motifs = pd.read_csv(output_handle, sep="\t", index_col=None) + motifs.dropna(subset=[FIMO_START, FIMO_STOP], inplace=True, how='any') + motifs[FIMO_START], motifs[FIMO_STOP] = motifs[FIMO_START].astype(int), motifs[FIMO_STOP].astype(int) + + if "#pattern name" in motifs.columns: + raise RuntimeError("FIMO version not supported; update to 5.0.5") + + motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in + zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])] + motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in + zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])] + + return motifs + except pde.EmptyDataError: + return None diff --git a/inferelator_prior/motifs/homer.py b/inferelator_prior/motifs/homer.py new file mode 100644 index 0000000..04a23a7 --- /dev/null +++ b/inferelator_prior/motifs/homer.py @@ -0,0 +1,70 @@ +import subprocess +import io +import pandas as pd +import numpy as np + +from inferelator_prior.motifs import chunk_motifs, homer_motif, SCAN_SCORE_COL, SCORE_PER_BASE +from 
class HOMERScanner(__MotifScanner):
    """
    Motif scanner that runs the HOMER executable's `find` command and parses its tab-separated output.
    """

    def _preprocess(self, min_ic=None):
        # Load motifs from file unless a motif list was handed to the constructor
        if self.motif_file is not None:
            self.motifs = homer_motif.read(self.motif_file)

        return chunk_motifs(homer_motif, self.motifs, num_workers=self.num_workers, min_ic=min_ic)

    def _postprocess(self, motif_peaks):
        # The same motif at the same location is only reported once
        motif_peaks = motif_peaks.drop_duplicates(subset=[HOMER_MOTIF, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME])
        return motif_peaks

    def _get_motifs(self, fasta_file, motif_file, threshold=None):
        """
        Scan a FASTA file with one chunk of motifs.

        :param fasta_file: FASTA file to scan
        :type fasta_file: str
        :param motif_file: HOMER motif file
        :type motif_file: str
        :param threshold: Accepted for interface compatibility; not passed on to HOMER
        :return: Parsed motif hits
        :rtype: pd.DataFrame
        """
        homer_command = [HOMER_EXECUTABLE_PATH, "find", "-i", fasta_file, "-m", motif_file, "-offset", str(0)]
        proc = subprocess.run(homer_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

        # A failed scan is reported but not fatal; whatever was written to stdout is still parsed
        if int(proc.returncode) != 0:
            print("HOMER motif scan failed for {meme}, {fa} ({cmd})".format(meme=motif_file,
                                                                            fa=fasta_file,
                                                                            cmd=" ".join(homer_command)))

        return self._parse_output(io.StringIO(proc.stdout.decode("utf-8")))

    def _parse_output(self, output_handle):
        """
        Parse HOMER `find` output into a dataframe with chromosome / start / stop and score columns.

        :param output_handle: Readable handle with the header-less, tab-separated HOMER output
        :return: One row per hit
        :rtype: pd.DataFrame
        """
        motifs = pd.read_csv(output_handle, sep="\t", index_col=None, names=HOMER2_FIND_COLS)

        # The sequence ID is split on ":" and "-" into chromosome, start and stop; a fourth piece is
        # labelled UNKNOWN and ignored
        loc_data = motifs[HOMER_SEQ_ID].str.split(r"[\:\-]", expand=True)
        loc_data.columns = HOMER2_EXPAND_STR_COLS if loc_data.shape[1] == 3 else HOMER2_EXPAND_STR_COLS + ["UNKNOWN"]
        loc_data[HOMER_START] = loc_data[HOMER_START].astype(int) + motifs[HOMER_OFFSET]

        match_width = motifs[HOMER_MATCH].str.len()

        # Hits on the "-" strand are shifted back by (match width - 1) before the stop is derived
        minus_strand = motifs[HOMER_STRAND] == "-"
        loc_data.loc[minus_strand, HOMER_START] -= match_width.loc[minus_strand] - 1

        loc_data[HOMER_STOP] = loc_data[HOMER_START] + match_width

        motifs[[HOMER_CHROMOSOME, HOMER_START, HOMER_STOP]] = loc_data[[HOMER_CHROMOSOME, HOMER_START, HOMER_STOP]]
        motifs.drop([HOMER_SEQ_ID, HOMER_OFFSET], inplace=True, axis=1)

        # self.motifs is looked up by motif ID here
        motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in
                                  zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])]
        motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in
                                  zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])]

        return motifs
def __write_motif(motif_fh, motif):
    """
    Write one motif to an open handle as a HOMER motif record.

    :param motif_fh: Writable text handle
    :param motif: Motif to write
    :type motif: Motif
    :raises ValueError: If the motif has an alphabet other than ACGT
    """

    if motif.alphabet is not None and "".join(motif.alphabet).upper() != HOMER_ALPHABET:
        raise ValueError("HOMER requires ACGT alphabet only")

    p_mat = "\n".join("\t".join(" {:.4f}".format(x) for x in r) for r in motif.probability_matrix)

    # homer_odds is a string when the motif was parsed from a HOMER file (it is taken straight from the
    # split header line), and the record template formats it with a float format code
    record = HOMER_MOTIF_RECORD.format(consensus=motif.consensus,
                                       mname=motif.motif_id,
                                       odds_score=float(motif.homer_odds),
                                       pmatrix=p_mat)
    print(record, file=motif_fh)
def __parse_background(handle, strict=True):
    """
    Find the background letter frequencies in a MEME file.

    The first non-empty line after a line starting with "background" is read as alternating
    letter / frequency pairs. The handle is always rewound to the start before returning or raising,
    so the caller can parse the same handle again.

    :param handle: Seekable text handle of a MEME file
    :param strict: Raise if no background frequencies are found; otherwise return None
    :type strict: bool
    :return: Letter to frequency mapping, or None if nothing was found and strict is False
    :rtype: dict, None
    :raises MEMEDatabaseError: If the frequency line does not hold letter / value pairs, or if nothing
        was found and strict is True
    """

    find_flag = False

    for line in handle:
        line = line.strip()
        if line.lower().startswith("background"):
            find_flag = True
            continue
        if len(line) > 0 and find_flag:
            probs = line.split()

            if len(probs) % 2 != 0:
                raise MEMEDatabaseError("Background probabilities do not parse correctly")

            handle.seek(0)
            return {a: float(b) for a, b in zip(probs[::2], probs[1::2])}

    # Nothing found: the loop consumed the whole file, so rewind or the motif parser that runs next
    # would start at end-of-file and see no motifs
    handle.seek(0)

    if strict:
        raise MEMEDatabaseError("Unable to locate background probabilities")
    else:
        return None
def __write_header(handle, alphabet, bkgd, pos_strand=True, neg_strand=True):
    """
    Write the MEME version 4 file header (alphabet, strands and background frequencies) to a handle.

    :param handle: Writable text handle
    :param alphabet: Letters of the alphabet, in matrix column order
    :param bkgd: Background frequencies as a numpy array; flattened and paired with the alphabet
    :param pos_strand: Declare the + strand
    :type pos_strand: bool
    :param neg_strand: Declare the - strand
    :type neg_strand: bool
    """

    # Exactly one strand is declared only if the other flag is off; any other combination declares both
    if neg_strand and not pos_strand:
        strand_symbols = ["-"]
    elif pos_strand and not neg_strand:
        strand_symbols = ['+']
    else:
        strand_symbols = ["-", "+"]

    frequencies = []
    for letter, freq in zip(alphabet, bkgd.flatten().tolist()):
        frequencies.append("{} {:.5f}".format(letter, freq))

    header_text = MEME4_HEADER.format(alphabet="".join(alphabet),
                                      strands=" ".join(strand_symbols),
                                      bkgd=" ".join(frequencies))

    print(header_text, file=handle)
def read(pwm_file_list, info_file, background=None, direct_only=False):
    """
    Build motifs from a list of tab-separated PWM files and a motif information table.

    The motif ID is the PWM file's base name without extension; the motif name is every TF name the
    information table lists for that ID, joined with "/". Files that cannot be parsed or that hold
    an empty matrix are skipped.

    :param pwm_file_list: Paths of the PWM files (first column is used as the index, remaining column
        names become the alphabet)
    :type pwm_file_list: list(str)
    :param info_file: Tab-separated table with motif ID, TF name and TF status columns
    :type info_file: str
    :param background: Background passed to every Motif
    :param direct_only: Skip motifs without any status containing "D" and name the rest only after the
        TFs whose status is exactly "D"
    :type direct_only: bool
    :return: Motifs that were loaded
    :rtype: list(Motif)
    """

    info_df = pd.read_csv(info_file, sep="\t")
    motifs = []

    for pwm_file in pwm_file_list:
        pwm_id = os.path.splitext(os.path.basename(pwm_file))[0]
        is_this_motif = info_df[MOTIF_COL] == pwm_id

        if direct_only:
            direct = info_df.loc[is_this_motif, TF_STATUS_COL].str.contains("D")
            if not direct.any():
                continue
            pwm_names = info_df.loc[is_this_motif & (info_df[TF_STATUS_COL] == "D"), TF_NAME_COL]
        else:
            pwm_names = info_df.loc[is_this_motif, TF_NAME_COL]

        pwm_name = "/".join(pwm_names)

        try:
            pwm = pd.read_csv(pwm_file, sep="\t", index_col=0)
        except pde.ParserError:
            # Report the file that failed, not the TF names attached to it
            print("Parser error on file {f}".format(f=pwm_file))
            continue
        except pde.EmptyDataError:
            # A completely empty file holds no matrix; skip it like any other empty matrix
            continue

        if min(pwm.values.shape) == 0:
            continue

        pwm_alphabet = pwm.columns.tolist()

        motif = Motif(pwm_id, pwm_name, pwm_alphabet, motif_background=background)
        motif.probability_matrix = pwm.values

        motifs.append(motif)

    return motifs
def main():
    """
    Command-line entry point: parse arguments, build the prior, and write the result tables.

    Without ``--threshold`` a single prior is built and three files are written next to the output prefix:
    ``_edge_matrix.tsv.gz``, ``_edge_table.tsv.gz`` and ``_unfiltered_matrix.tsv.gz``.

    With ``--threshold T1 [T2 ...]`` the prior is rebuilt once per scanner threshold and the number of non-zero
    entries per column of the unfiltered matrix is written to ``_edge_count.tsv``.

    :return: None
    """
    ap = argparse.ArgumentParser(description="Create a prior from a genome, TF motifs, and an optional BED file")
    ap.add_argument("-m", "--motif", dest="motif", help="Motif file", metavar="PATH", required=True)
    ap.add_argument("--motif_format", dest="motif_format", help="Motif file FORMAT (transfac or meme)",
                    metavar="FORMAT", default="meme")
    ap.add_argument("-b", "--bed", dest="atac", help="BED file", metavar="FILE", default=None)
    ap.add_argument("-f", "--fasta", dest="fasta", help="Genomic FASTA file", metavar="FILE", required=True)
    ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="FILE", required=True)
    ap.add_argument("-o", "--out", dest="out", help="Output PATH prefix", metavar="PATH", required=True)
    ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=0, nargs="+")
    ap.add_argument("-c", "--cpu", dest="cores", help="Number of cores", metavar="CORES", type=int, default=1)
    ap.add_argument("--no_tss", dest="tss", help="Use gene body for window (not TSS)", action='store_const',
                    const=False, default=True)
    ap.add_argument("--scan", dest="scanner", help="FIMO or HOMER", type=str, default='fimo')
    ap.add_argument("--motif_preprocessing_ic", dest="min_ic", help="Minimum information content",
                    metavar="BITS", type=int, default=None)
    ap.add_argument("--tandem_window", dest="tandem", help="Bases between TF bindings to consider an array",
                    metavar="BASES", type=int, default=100)
    ap.add_argument("--threshold", nargs="+", default=None, type=str)

    args = ap.parse_args()

    # The output argument is a file-name PREFIX, so the directory to create is its parent.
    # (Creating the prefix itself would leave a stray, empty directory named like the prefix.)
    out_prefix = os.path.abspath(os.path.expanduser(args.out))
    os.makedirs(os.path.dirname(out_prefix), exist_ok=True)

    # Options shared by every call below; --tandem_window is forwarded so that the flag takes effect
    prior_kwargs = dict(window_size=args.window_size,
                        num_cores=args.cores,
                        use_tss=args.tss,
                        motif_ic=args.min_ic,
                        tandem=args.tandem,
                        scanner_type=args.scanner,
                        motif_format=args.motif_format)

    if args.threshold is None:
        prior_edges, prior_matrix, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation,
                                                                       args.fasta, **prior_kwargs)

        prior_matrix.astype(int).to_csv(out_prefix + "_edge_matrix.tsv.gz", sep="\t")
        prior_edges.to_csv(out_prefix + "_edge_table.tsv.gz", sep="\t")
        raw_matrix.to_csv(out_prefix + "_unfiltered_matrix.tsv.gz", sep="\t")
    else:
        # NOTE(review): the motif file is loaded through MotifScan.load_motif_file here regardless of
        # --motif_format - confirm that this loader copes with non-MEME input
        motifs = MotifScan.load_motif_file(args.motif)
        motif_information = motifs_to_dataframe(motifs)
        motif_information = motif_information[[MOTIF_NAME_COL, INFO_COL]].groupby(MOTIF_NAME_COL).agg("max")

        # One column of non-zero counts per requested scanner threshold
        edge_count = {}
        for t in args.threshold:
            _, _, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation, args.fasta,
                                                      scanner_thresh=t, **prior_kwargs)
            edge_count[t] = (raw_matrix != 0).sum(axis=0)

        edge_count = pd.concat(edge_count, axis=1)
        edge_count = edge_count.join(motif_information[INFO_COL])

        edge_count.to_csv(out_prefix + "_edge_count.tsv", sep="\t")
def get_peaks_in_features(feature_dataframe, peak_dataframe, feature_group_column=GTF_CHROMOSOME,
                          peak_group_column=BED_CHROMOSOME):
    """
    Sum, for every feature, the bases of peak overlap on the same chromosome.

    Each overlapping peak is clipped to the feature boundaries and the clipped widths are added up; peaks are not
    merged first, so bases covered by several peaks count once per peak. Features on chromosomes that have no
    peaks are left out of the result.

    :param feature_dataframe: Features with SEQ_START / SEQ_STOP columns and a grouping column
    :type feature_dataframe: pd.DataFrame
    :param peak_dataframe: Peaks with SEQ_START / SEQ_STOP columns and a grouping column
    :type peak_dataframe: pd.DataFrame
    :param feature_group_column: Column of feature_dataframe that holds the chromosome
    :param peak_group_column: Column of peak_dataframe that holds the chromosome
    :return: Data frame with the GTF_GENENAME and SEQ_COUNTS columns
    :rtype: pd.DataFrame
    """

    def _overlap_bases(feature, chrom_peaks):
        # Peaks touching the feature: peak stop at/after feature start AND peak start at/before feature stop
        hits = (feature[SEQ_START] <= chrom_peaks[SEQ_STOP]) & (feature[SEQ_STOP] >= chrom_peaks[SEQ_START])
        if not hits.any():
            return 0

        # Clip every hit to the feature boundaries before measuring it
        clipped = chrom_peaks.loc[hits, :].copy()
        clipped.loc[clipped[SEQ_START] < feature[SEQ_START], SEQ_START] = feature[SEQ_START]
        clipped.loc[clipped[SEQ_STOP] > feature[SEQ_STOP], SEQ_STOP] = feature[SEQ_STOP]
        return sum(clipped[SEQ_STOP] - clipped[SEQ_START])

    # Work on a copy that starts with a zeroed count column
    features = feature_dataframe.copy()
    features[SEQ_COUNTS] = 0

    # Split both tables by chromosome
    genes_by_chr = {key: frame for key, frame in features.groupby(feature_group_column)}
    peaks_by_chr = {key: frame for key, frame in peak_dataframe.groupby(peak_group_column)}

    # Only chromosomes present in both tables are processed
    shared = set(genes_by_chr).intersection(set(peaks_by_chr))

    counted = []
    for chromosome in shared:
        chrom_genes = genes_by_chr[chromosome]

        # Restore the chromosome column, then compute the overlap row by row
        chrom_genes[feature_group_column] = chromosome
        chrom_genes[SEQ_COUNTS] = chrom_genes.apply(_overlap_bases, axis=1, args=(peaks_by_chr[chromosome],))
        counted.append(chrom_genes)

    # Stack the per-chromosome results and keep just the name and count columns
    combined = pd.concat(counted).reset_index()
    return combined.loc[:, [GTF_GENENAME, SEQ_COUNTS]]
def _merge_peaks_with_bedtools(merge_data, merge_columns, merge_function_names, max_distance=0):
    """
    Merge overlapping (or nearby) intervals with bedtools and return them as a data frame.

    :param merge_data: Interval table; the first three columns are the interval and every further column must be
        listed in merge_columns
    :type merge_data: pd.DataFrame
    :param merge_columns: Column numbers handed to bedtools merge as ``c``
    :type merge_columns: list(int)
    :param merge_function_names: Operation names handed to bedtools merge as ``o``, one per merge column
    :type merge_function_names: list(str)
    :param max_distance: Distance handed to bedtools merge as ``d``
    :type max_distance: int
    :return: Merged intervals carrying the same column labels as merge_data
    :rtype: pd.DataFrame
    """

    assert len(merge_columns) == len(merge_function_names)
    assert len(merge_columns) + 3 == merge_data.shape[1]

    # Load the data into a BedTool object
    pbt_data = pybedtools.BedTool.from_dataframe(merge_data).sort()

    if len(merge_columns) > 0:
        # Merge the overlapping peaks and aggregate the extra columns
        merged = pbt_data.merge(d=max_distance, c=merge_columns, o=merge_function_names)
    else:
        merged = pbt_data.merge(d=max_distance)

    # The distance belongs to merge() only. It used to be passed to to_dataframe() as well, where the first
    # positional parameter is `disable_auto_names`, so a non-zero distance changed how the table was parsed.
    merged = merged.to_dataframe()

    merged.columns = merge_data.columns
    return merged
def open_window(annotation_dataframe, window_size, use_tss=False, check_against_fasta=None):
    """
    Return a copy of the annotations whose start / stop have been widened by a window.

    The window is strand-aware: on "+" the first value extends the start and the second extends the stop, on "-"
    the two are swapped. Starts are clipped at 0. When a FASTA file is given, starts and stops are also clipped to
    the length of the matching sequence record.

    :param annotation_dataframe: Gene annotations
    :type annotation_dataframe: pd.DataFrame
    :param window_size: A single value, or a sequence of one value or two (upstream, downstream) values
    :type window_size: int, list(int)
    :param use_tss: Open the window around the TSS column instead of around start / stop
    :type use_tss: bool
    :param check_against_fasta: Path to a FASTA file used to validate chromosome names and lengths
    :type check_against_fasta: str, None
    :return window_annotate: Adjusted copy of the annotations
    :rtype: pd.DataFrame
    :raises ValueError: If window_size has more than two values (or none), or if no GTF chromosome is in the FASTA
    """

    def _fasta_record_lengths(fasta_path):
        # Sequence length per record; the record name is the first token after ">"
        lengths = {}
        record = None
        with open(fasta_path, mode="r") as fasta_fh:
            for line in fasta_fh:
                if line.startswith(">"):
                    record = line[1:].split()[0]
                    lengths[record] = 0
                else:
                    lengths[record] += len(line.strip())
        return lengths

    window_annotate = annotation_dataframe.copy()

    # A scalar has no len(); the TypeError it raises selects the symmetric-window case
    try:
        n_values = len(window_size)
        if n_values == 1:
            w_up = w_down = window_size[0]
        elif n_values == 2:
            w_up, w_down = window_size[0], window_size[1]
        else:
            raise ValueError("window_size must have 1 or 2 values only")
    except TypeError:
        w_up = w_down = window_size

    # Anchor both edges on the TSS, or on the existing start / stop
    start_anchor = window_annotate[SEQ_TSS if use_tss else SEQ_START]
    stop_anchor = window_annotate[SEQ_TSS if use_tss else SEQ_STOP]

    # Compute every edge before writing anything back
    fwd_start, fwd_stop = start_anchor - w_up, stop_anchor + w_down
    rev_start, rev_stop = start_anchor - w_down, stop_anchor + w_up

    on_forward = window_annotate[GTF_STRAND] == "+"
    on_reverse = window_annotate[GTF_STRAND] == "-"

    window_annotate.loc[on_forward, SEQ_START] = fwd_start
    window_annotate.loc[on_forward, SEQ_STOP] = fwd_stop
    window_annotate.loc[on_reverse, SEQ_START] = rev_start
    window_annotate.loc[on_reverse, SEQ_STOP] = rev_stop

    # Coordinates cannot run off the left end of a chromosome
    window_annotate.loc[window_annotate[SEQ_START] < 0, SEQ_START] = 0

    if check_against_fasta is None:
        return window_annotate

    fasta_len = _fasta_record_lengths(check_against_fasta)

    gtf_names = set(window_annotate[GTF_CHROMOSOME].unique())
    fasta_names = set(fasta_len.keys())
    shared_names = gtf_names.intersection(fasta_names)

    # Warn when some GTF chromosome has no FASTA record
    if shared_names != gtf_names:
        _msg = "GTF File Chromosomes {g} do not match FASTA File Chromosomes {f}\n"
        _msg += "The following chromosomes will not map correctly: {ft}"
        warnings.warn(_msg.format(g=gtf_names, f=fasta_names, ft=gtf_names.symmetric_difference(fasta_names)))

    if len(shared_names) == 0:
        raise ValueError("Unable to map FASTA and GTF chromosomes together")

    # Clip anything that runs past the right end of its chromosome
    for chromosome in shared_names:
        chr_length = fasta_len[chromosome]
        on_chromosome = window_annotate[GTF_CHROMOSOME] == chromosome
        window_annotate.loc[on_chromosome & (window_annotate[SEQ_STOP] > chr_length), SEQ_STOP] = chr_length
        window_annotate.loc[on_chromosome & (window_annotate[SEQ_START] > chr_length), SEQ_START] = chr_length

    return window_annotate
# TODO: test this
async def _htseq_count(srr_id, sam_file_name, annotation_file_name, output_path, semaphore):
    """
    Count reads per feature for one aligned SAM file by running HTSeq in a subprocess.

    :param srr_id: str
        NCBI SRR ID string
    :param sam_file_name: str
        Path to the aligned SAM file
    :param annotation_file_name: str
        A path to the GTF/GFF annotations
    :param output_path: str
        A path to the output
    :param semaphore: asyncio.Semaphore
        Semaphore for resource utilization
    :return output_file: str
        The path to the count file, or None if there was no SAM file or the subprocess failed
    """

    async with semaphore:

        if sam_file_name is None:
            return None

        # Create the output directory if needed; an existing directory is fine
        os.makedirs(output_path, exist_ok=True)

        output_file = os.path.join(file_path_abs(output_path), srr_id + HTSEQ_COUNT_FILE_EXTENSION)

        # Reuse a count file left by an earlier run
        if os.path.exists(output_file):
            print("{id} countfile exists ({path})".format(id=srr_id, path=output_path))
            return output_file

        # Call out to an external interpreter
        htseq_call = [*HTSEQ_EXECUTABLE_CALL, "--stranded=no", sam_file_name, annotation_file_name]

        print(" ".join(htseq_call))
        process = await asyncio.create_subprocess_exec(*htseq_call,
                                                       stdout=asyncio.subprocess.PIPE,
                                                       stderr=asyncio.subprocess.PIPE)

        (output_data, output_err) = await process.communicate()

        if int(process.returncode) != 0:
            print("HTSeq.count failed for {id} ({file})".format(id=srr_id, file=sam_file_name))
            # Pass on what the subprocess reported, so the failure can be diagnosed
            if output_err:
                print(output_err.decode('utf-8', errors='replace'), file=sys.stderr)
            return None

        # The counts arrive on stdout; store them as the count file
        with open(output_file, mode="w") as out_fh:
            print(output_data.decode('utf-8'), file=out_fh)

        return output_file
# Turn count files into a count matrix
# TODO: test this
def pileup_raw_counts(srr_ids, count_files):
    """
    Convert the HTSeq count files to a dataframe of SRR-derived expression values

    :param srr_ids: list(str)
        NCBI SRR ID string
    :param count_files: list(str)
        A list of HTSeq count files; entries that are None are skipped
    :return matrix_data, meta_data: pd.DataFrame [Genes x Samples], pd.DataFrame [Samples x Metadata]
        A dataframe of raw, unnormalized count values from all SRR alignments, and a dataframe holding the
        flagged summary rows of each count file plus the sum of its gene counts.
        Both are empty when no count file could be used.
    """
    matrix_data = pd.DataFrame()
    meta_data = []
    flag_length = len(META_STARTSWITH_FLAG)

    for srr_id, count_file_name in zip(srr_ids, count_files):

        if count_file_name is None:
            continue

        # Load in the count data
        count_data = pd.read_csv(count_file_name, sep="\t", index_col=0, header=None)
        count_data.index.name = INDEX_NAME
        count_data.columns = [COUNT_COLUMN]

        # Find the metadata
        count_metadata_indexes = count_data.index[count_data.index.str.startswith(META_STARTSWITH_FLAG)]

        # Process metadata into a single row labelled with the SRR ID
        count_meta_data = count_data.loc[count_metadata_indexes, :].rename(columns={COUNT_COLUMN: srr_id}).transpose()

        # Remove exactly the flag prefix. str.strip() treats its argument as a set of characters and
        # would also eat underscores at the end of a name.
        count_meta_data.columns = count_meta_data.columns.str[flag_length:]

        # Remove metadata from count dataframe
        count_data = count_data.drop(count_metadata_indexes)

        # Make sure that the pileup matrix has all the genes
        if len(count_data.index.difference(matrix_data.index)) > 0:
            new_index = matrix_data.index.union(count_data.index)
            matrix_data = matrix_data.reindex(new_index).fillna(0)

        # Make sure that the count data is aligned to the matrix data
        if not count_data.index.equals(matrix_data.index):
            count_data = count_data.reindex(matrix_data.index).fillna(0)

        # Stick the count data onto the data frame
        matrix_data[srr_id] = count_data[COUNT_COLUMN].astype(int)

        # Add the total counts to the metadata
        count_meta_data[META_ALIGNED_COUNTS] = count_data[COUNT_COLUMN].sum()
        meta_data.append(count_meta_data)

    # pd.concat refuses an empty list, so the no-usable-input case is handled explicitly
    if not meta_data:
        return matrix_data, pd.DataFrame()

    # Combine the meta_data into a single dataframe
    meta_data = pd.concat(meta_data)

    return matrix_data, meta_data
def load_gene_lengths(annotation_file):
    """
    Load gene lengths from an annotation file

    :param annotation_file: str
        Path to the genome annotation (GTF) file
    :return gene_lengths: pd.DataFrame[G x 1]
        Dataframe indexed by gene name

        ========== ======= ==============================================================
        length     float   sum of exon length in kilobases
        ========== ======= ==============================================================

    """

    # Load a GFF reader from HTSeq
    gff_reader = HTSeq.GFF_Reader(annotation_file)

    # Add up the exon lengths of every gene, in bases
    exon_bases = {}
    for gf in gff_reader:
        if gf.type == TRANSCRIPT_TYPE_FLAG:
            exon_bases[gf.name] = exon_bases.get(gf.name, 0) + _gene_length(gf)

    # Convert bases to kilobases. The divisor is 1e3; the previous 10e3 equals 10,000 and made
    # every length ten times too small.
    gene_lengths = pd.DataFrame.from_dict({gn: bases / 1e3 for gn, bases in exon_bases.items()},
                                          orient='index', columns=['length'])

    return gene_lengths
class MotifScorer:
    """
    Turn the motif hits of one TF near one gene into a single (score, count, start, stop) tuple.

    The thresholds live on the class, so set_information_criteria changes them for every caller.
    """

    # Minimum scan score for a hit to be considered; None disables the filter
    min_binding_ic = MINIMUM_MOTIF_IC_BITS
    # Distance threshold used when deciding whether hits form a tandem array
    max_dist = MAXIMUM_TANDEM_DISTANCE

    @classmethod
    def set_information_criteria(cls, min_binding_ic=None, max_dist=None):
        """
        Set the class-level thresholds; an argument left as None keeps the current value
        :param min_binding_ic: Minimum scan score of a hit / minimum information content of a motif
        :param max_dist: Distance threshold for tandem arrays
        :return: None
        """
        cls.min_binding_ic = cls.min_binding_ic if min_binding_ic is None else min_binding_ic
        cls.max_dist = cls.max_dist if max_dist is None else max_dist

    @classmethod
    def score_tf(cls, tf_motifs):
        """
        Score a single TF
        :param tf_motifs: Motif binding sites from FIMO/HOMER
        :type tf_motifs: pd.DataFrame
        :return: (score, number of sites, start, stop) of the best site or tandem array,
            or None if no site passes the threshold
        """

        assert isinstance(tf_motifs, pd.DataFrame)

        # Drop sites that don't meet threshold
        if cls.min_binding_ic is not None:
            tf_motifs = tf_motifs.loc[tf_motifs[SCAN_SCORE_COL] >= cls.min_binding_ic, :]
        n_sites = tf_motifs.shape[0]

        # If there's no data return None
        if n_sites == 0:
            return None

        # Sort and check for overlapping motifs (a site that starts before the previous site stops)
        tf_motifs = tf_motifs.sort_values(by=MotifScan.start_col)
        overlap = tf_motifs[MotifScan.start_col] < tf_motifs[MotifScan.stop_col].shift()

        # Collapse together any overlapping motifs to the maximum score on a per-base basis
        if overlap.any():

            # Every non-overlapping site opens a new group; overlapping sites join the previous one
            tf_motifs["GROUP"] = (~overlap).cumsum()
            tf_motifs = pd.concat([cls._agg_per_base(group) for _, group in tf_motifs.groupby("GROUP")])

        n_sites = tf_motifs.shape[0]

        # If there's only one site check it and then return
        if n_sites == 1:
            return cls._top_hit(tf_motifs)

        # If there's only two sites check it and then return
        if n_sites == 2:
            # NOTE(review): this is (stop of first site) - (start of second site), which is not positive for
            # sorted, non-overlapping sites, so two sites look like they are always treated as a tandem pair.
            # Confirm whether the intended distance is start[1] - stop[0].
            consider_tandem = tf_motifs.iloc[0, :][MotifScan.stop_col] - tf_motifs.iloc[1, :][MotifScan.start_col]
            if consider_tandem > cls.max_dist:
                return cls._top_hit(tf_motifs)
            else:
                start = tf_motifs.iloc[0, :][MotifScan.start_col]
                stop = tf_motifs.iloc[1, :][MotifScan.stop_col]
                score = tf_motifs[SCAN_SCORE_COL].sum()
                return score, 2, start, stop

        # If there's more than two sites do the complicated tandem checking stuff
        else:
            # Find things that are in tandems
            # NOTE(review): the measure here is (stop of this site) - (start of the previous site), i.e. the span
            # of both sites rather than the gap between them - confirm this is intended
            consider_tandem = (tf_motifs[MotifScan.stop_col] - tf_motifs[MotifScan.start_col].shift(1))
            consider_tandem = consider_tandem <= cls.max_dist

            # Skip the rest if nothing is close enough to matter
            if not consider_tandem.any():
                return cls._top_hit(tf_motifs)

            # Ffill the tandem group to have the same start
            tandem_starts = tf_motifs[MotifScan.start_col].copy()
            tandem_starts.loc[consider_tandem] = pd.NA
            tandem_starts = tandem_starts.ffill()

            # Backfill the tandem group to have the same stop
            tandem_stops = tf_motifs[MotifScan.stop_col].copy()
            tandem_stops.loc[consider_tandem.shift(-1, fill_value=False)] = pd.NA
            tandem_stops = tandem_stops.bfill()

            # Concat, group by start/stop, and then sum IC scores
            tandem_peaks = pd.concat([tandem_starts, tandem_stops, tf_motifs[SCAN_SCORE_COL]], axis=1)
            tandem_peaks.columns = [PRIOR_START, PRIOR_STOP, PRIOR_SCORE]

            # One per site, so that the same sum also yields the number of sites in every group
            tandem_peaks[PRIOR_COUNT] = 1
            tandem_peaks = tandem_peaks.groupby(by=[PRIOR_START, PRIOR_STOP]).agg('sum').reset_index()

            # Return the highest tandem array group. The site count used to be `peak.shape[0]`, which is the
            # number of fields in the selected row and therefore always 3.
            peak = tandem_peaks.loc[tandem_peaks[PRIOR_SCORE].argmax(), :]
            return peak[PRIOR_SCORE], int(peak[PRIOR_COUNT]), peak[PRIOR_START], peak[PRIOR_STOP]

    @classmethod
    def preprocess_motifs(cls, gene_motif_data, motif_information):
        """
        Drop motifs and motif hits that fall below min_binding_ic; a no-op when the threshold is None
        :param gene_motif_data: Motif hits
        :type gene_motif_data: pd.DataFrame
        :param motif_information: Motif characteristics
        :type motif_information: pd.DataFrame
        :return: (filtered hits, filtered motif information)
        """
        if cls.min_binding_ic is not None:
            # Keep motifs that are informative enough ...
            motif_information = motif_information.loc[motif_information[INFO_COL] >= cls.min_binding_ic, :]
            keeper_motifs = motif_information[MOTIF_COL].unique().tolist()

            # ... and hits that belong to a kept motif and score high enough themselves
            keeper_idx = (gene_motif_data[MotifScan.name_col].isin(keeper_motifs))
            keeper_idx &= (gene_motif_data[SCAN_SCORE_COL] >= cls.min_binding_ic)

            return gene_motif_data.loc[keeper_idx, :], motif_information
        else:
            return gene_motif_data, motif_information

    @staticmethod
    def _top_hit(tf_motifs):
        """
        Return (score, 1, start, stop) for the highest-scoring site, or None for an empty frame
        :param tf_motifs: Motif binding sites
        :type tf_motifs: pd.DataFrame
        """
        if tf_motifs.shape[0] == 0:
            return None
        elif tf_motifs.shape[0] == 1:
            top_hit = tf_motifs.iloc[0, :]
        else:
            top_hit = tf_motifs.iloc[tf_motifs[SCAN_SCORE_COL].values.argmax(), :]

        start = MotifScorer._first_value(top_hit[MotifScan.start_col])
        stop = MotifScorer._first_value(top_hit[MotifScan.stop_col])
        score = MotifScorer._first_value(top_hit[SCAN_SCORE_COL])
        return score, 1, start, stop

    @staticmethod
    def _first_value(series):
        """
        Return the first element of anything that has .iloc, and any other value unchanged
        """
        try:
            return series.iloc[0]
        except AttributeError:
            return series

    @classmethod
    def _agg_per_base(cls, overlap_df):
        """
        Aggregate an overlapping set of motif peaks by summing the maximum per-base IC for each base
        :param overlap_df: Sites that overlap each other, including their per-base scores
        :type overlap_df: pd.DataFrame
        :return: Single-row frame spanning the whole group (a single site is passed through as is)
        :rtype: pd.DataFrame
        """
        if len(overlap_df) == 1:
            return overlap_df[[MotifScan.start_col, MotifScan.stop_col, SCAN_SCORE_COL, MOTIF_NAME_COL]]

        # Renumber on a new frame, so the caller's group is not modified
        overlap_df = overlap_df.reset_index()

        # One (position, per-base score) pair for every base of every site
        new_df = [(a, b) for i in overlap_df.index for a, b in zip(range(overlap_df.loc[i, MotifScan.start_col],
                                                                         overlap_df.loc[i, MotifScan.stop_col]),
                                                                   overlap_df.loc[i, SCORE_PER_BASE])]

        return pd.DataFrame({MotifScan.start_col: [overlap_df[MotifScan.start_col].min()],
                             MotifScan.stop_col: [overlap_df[MotifScan.stop_col].max()],
                             SCAN_SCORE_COL: pd.DataFrame(new_df, columns=["B", "S"]).groupby("B").agg('max').sum(),
                             MOTIF_NAME_COL: [overlap_df[MOTIF_NAME_COL].unique()[0]]})
on=MotifScan.name_col) + motif_names = motif_information[MOTIF_NAME_COL].unique() + + motif_peaks = {chromosome: df for chromosome, df in motif_peaks.groupby(MotifScan.chromosome_col)} + + def _prior_mapper(data): + i, gene_data, motifs = data + return _build_prior_for_gene(gene_data, motifs, motif_information, i) + + if num_workers == 1: + prior_data = list(map(_prior_mapper, _gene_gen(genes, motif_peaks))) + else: + with multiprocessing.Pool(num_workers, maxtasksperchild=1000) as pool: + prior_data = pool.map(_prior_mapper, _gene_gen(genes, motif_peaks), chunksize=20) + + # Combine priors for all genes + prior_data = pd.concat(prior_data).reset_index(drop=True) + prior_data[PRIOR_START] = prior_data[PRIOR_START].astype(int) + prior_data[PRIOR_STOP] = prior_data[PRIOR_STOP].astype(int) + + np.random.seed(seed) + + # Pivot to a matrix, extend to all TFs, and fill with 0s + raw_matrix = prior_data.pivot(index=PRIOR_GENE, columns=PRIOR_TF, values=PRIOR_SCORE) + raw_matrix = raw_matrix.reindex(motif_names, axis=1).reindex(genes[GTF_GENENAME], axis=0).fillna(0) + raw_matrix.index.name = PRIOR_GENE + + prior_matrix = raw_matrix.copy() + # Threshold using DBSCAN outlier detection + for reg in prior_matrix.columns: + prior_matrix.loc[~_find_outliers_lof(prior_matrix[reg]), reg] = 0. 
+ + # Keep the peaks that we want + thresholded_data = prior_matrix.reset_index().melt(id_vars=PRIOR_GENE, var_name=PRIOR_TF, value_name='T') + thresholded_data = prior_data.merge(thresholded_data, on=[PRIOR_GENE, PRIOR_TF]) + thresholded_data = thresholded_data.loc[thresholded_data['T'] != 0, :] + thresholded_data.drop('T', axis=1, inplace=True) + + return thresholded_data, prior_matrix, raw_matrix + + +def _gene_gen(genes, motif_peaks): + for i, (idx, gene_data) in enumerate(genes.iterrows()): + try: + gene_chr, gene_start, gene_stop = gene_data[GTF_CHROMOSOME], gene_data[SEQ_START], gene_data[SEQ_STOP] + + motif_data = motif_peaks[gene_data[GTF_CHROMOSOME]] + motif_mask = motif_data[MotifScan.stop_col] >= gene_start + motif_mask &= motif_data[MotifScan.start_col] <= gene_stop + motif_data = motif_data.loc[motif_mask, :].copy() + yield i, gene_data, motif_data + except KeyError: + continue + + +def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): + scores = tf_data.values.reshape(-1, 1) + counts = tf_data.shape[0] + + labels = DBSCAN(min_samples=np.log2(counts), eps=scores.max() / 100).fit_predict(scores) + + # Keep any outliers (outliers near 0 should be discarded) + keep_edge = pd.Series((labels == -1) & (tf_data.values > np.mean(scores)), index=tf_data.index) + + # Iterate through clusters in reverse order until at least t_1 and no more than t_2 edges are included + for lab in np.unique(labels)[::-1]: + current_ratio = keep_edge.sum() / keep_edge.size + new_labels = labels == lab + if current_ratio > t_1: + break + elif current_ratio + (new_labels.sum() / new_labels.size) > t_2: + break + else: + keep_edge |= new_labels + + return keep_edge + + +def _find_outliers_elliptic_envelope(tf_data, outlier=2.5, skip_threshold=0.002): + + scores = tf_data.values + keep_genes = pd.Series(False, index=tf_data.index) + + if np.var(scores) == 0.: + return keep_genes + + _nz_idx = scores > 0 + + if skip_threshold is not None and np.sum(_nz_idx) < (len(keep_genes) * 
skip_threshold): + return keep_genes | _nz_idx + + # Calculate Mahalanobis distance + _nzs = scores[_nz_idx].reshape(-1, 1) + + if np.var(_nzs) == 0.: + return keep_genes + + # Correct for just the dumbest bug in scipy + if (np.mean(_nzs) == np.median(_nzs)) & (np.sum(_nzs == np.median(_nzs)) > (0.5 * _nzs.size)): + keep_genes[_nz_idx] = (_nzs >= np.median(_nzs)).flatten() + return keep_genes + + m_dist = EllipticEnvelope(support_fraction=1).fit(_nzs).score_samples(_nzs) + scaled_m_dist = (m_dist - np.mean(m_dist)) / np.std(m_dist) + keep_genes[_nz_idx] = scaled_m_dist < (-1 * outlier) + + return keep_genes + + +def _find_outliers_lof(tf_data, skip_threshold=0.002): + scores = tf_data[tf_data > 0] + keep_genes = pd.Series(False, index=tf_data.index) + + if np.var(scores) == 0.: + return keep_genes + + if skip_threshold is not None and scores.size < (tf_data.size * skip_threshold): + keep_genes[scores.index] = True + return keep_genes + + labels = LocalOutlierFactor(n_neighbors=max(int(scores.size * 0.01), 20), + algorithm='brute', p=1).fit_predict(scores.values.reshape(-1, 1)) + keep_genes[scores.index] = (labels == -1) & (scores.values > np.mean(scores)) + return keep_genes + + +def _build_prior_for_gene(gene_info, motif_data, motif_information, num_iteration): + """ + Takes motifs identified by scan near a single gene and turns them into TF-gene scores + + :param gene_info: Gene information from annotations + :type gene_info: pd.DataFrame + :param motif_data: Motif locations near the gene + :type motif_data: pd.DataFrame + :param motif_information: Motif information + :type motif_information: pd.DataFrame + :param num_iteration: Number of genes which have been processed + :type num_iteration: int + :return prior_edges: pd.DataFrame [N x 5] + 'regulator': tf name + 'target': gene name + 'count': number of motifs found + 'score': information content-based score of binding site + 'motif_ic': information content score of motif + 'start': binding site start + 'stop': 
binding site stop + 'chromosome' binding site chromosome + """ + + gene_name = gene_info[GTF_GENENAME] + gene_chr, gene_start, gene_stop = gene_info[GTF_CHROMOSOME], gene_info[SEQ_START], gene_info[SEQ_STOP] + + if num_iteration % 100 == 0: + print("Processing gene {i} [{gn}]".format(i=num_iteration, gn=gene_name)) + + if min(motif_data.shape) == 0: + return pd.DataFrame(columns=PRIOR_COLS) + + prior_edges = [] + for tf, tf_peaks in motif_data.groupby(MOTIF_NAME_COL): + tf_info = motif_information.loc[motif_information[MOTIF_NAME_COL] == tf, :] + res = MotifScorer.score_tf(tf_peaks) + + # Unpack results if there is a hit + if res is None: + continue + else: + score, tf_counts, start, stop = res + + info = tf_info[INFO_COL].mean() if tf_info.shape[0] > 0 else np.nan + + # Add this edge to the table + prior_edges.append((tf, gene_name, tf_counts, score, info, start, stop, gene_chr)) + + return pd.DataFrame(prior_edges, columns=PRIOR_COLS) diff --git a/inferelator_prior/processor/samtools.py b/inferelator_prior/processor/samtools.py new file mode 100644 index 0000000..1e64fbb --- /dev/null +++ b/inferelator_prior/processor/samtools.py @@ -0,0 +1,83 @@ +import asyncio +import os + +from inferelator_prior import SAMTOOLS_EXECUTABLE_PATH + +BAM_EXTENSION = ".bam" + + +def sam_sort(srr_ids, sam_files, target_path, min_quality=None, num_workers=5): + """ + Sort (and filter) SAM files into BAM files + + :param srr_ids: list(str) + List of SRA IDs to acquire from NCBI + :param sam_files: list(str) + List of SAM file paths + :param target_path: str + Target path for the SRA files + :param min_quality: int + If set, filter reads for MINQ + :param num_workers: int + Number of concurrent jobs to run + :return: + """ + + sem = asyncio.Semaphore(num_workers) + + tasks = [_process_sam(sid, sfn, target_path, sem, min_quality=min_quality) for sid, sfn in zip(srr_ids, sam_files)] + + return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) + + +async def 
_process_sam(srr_id, sam_file, target_path, semaphore, min_quality=None): + """ + Sort (and filter) SAM file into BAM file + :param srr_id: str + NCBI SRR ID string + :param sam_file: + The complete path to the SAM file + :param target_path: str + The path to put the BAM file + :param semaphore: asyncio.Semaphore + Semaphore for resource utilization + :param min_quality: int + Minimum alignment quality score to include (None disables filter) + :return bam_file_name: str + Path to the created BAM file name + """ + async with semaphore: + + if sam_file is None: + return None + + bam_file_name = os.path.join(target_path, srr_id + BAM_EXTENSION) + + samtools_sort_call = [SAMTOOLS_EXECUTABLE_PATH] + sort_cmd = ["sort", "-o", bam_file_name] + + # If min_quality is set, pipe in a view with a -q flag set + if min_quality is not None: + samtools_sort_call.extend(["view", "-q", str(min_quality), sam_file, "|", + SAMTOOLS_EXECUTABLE_PATH, *sort_cmd, "-"]) + else: + samtools_sort_call.extend([*sort_cmd, sam_file]) + + # Create a sorted BAM file + try: + print(" ".join(samtools_sort_call)) + process = await asyncio.create_subprocess_exec(*samtools_sort_call) + code = await process.wait() + except: + code = 1 + raise + finally: + if int(code) != 0: + print("samtools sort failed for {id} ({file})".format(id=srr_id, file=bam_file_name)) + try: + os.remove(bam_file_name) + except FileNotFoundError: + pass + return None + + return bam_file_name diff --git a/inferelator_prior/processor/srr.py b/inferelator_prior/processor/srr.py new file mode 100644 index 0000000..0625081 --- /dev/null +++ b/inferelator_prior/processor/srr.py @@ -0,0 +1,173 @@ +import asyncio +import os + +from inferelator_prior.processor.utils import file_path_abs +from inferelator_prior import FASTQDUMP_EXECUTABLE_PATH, PREFETCH_EXECUTABLE_PATH + +PREFETCH_OPTIONS = ["--max-size", "1000000000"] + +SRA_EXTENSION = ".sra" +POSSIBLE_FASTQ_EXTENSIONS = [".fastq.gz", "_1.fastq.gz", "_2.fastq.gz", "_3.fastq.gz", 
"_4.fastq.gz"] + + +# TODO: test this +def get_srr_files(srr_list, target_path, num_workers=5, prefetch_options=PREFETCH_OPTIONS): + """ + Take a list of SRR ID strings, download them async with num_workers concurrent jobs, and return a list of the + paths to the SRR files that have been downloaded. + :param srr_list: list(str) + List of SRA IDs to acquire from NCBI + :param target_path: str + Target path for the SRA files + :param num_workers: int + Number of concurrent jobs to run + :param prefetch_options: list(str) + Any additional command line arguments to pass to prefetch + :return: + """ + sem = asyncio.Semaphore(num_workers) + + srr_file_names = list(map(lambda x: os.path.join(file_path_abs(target_path), x + SRA_EXTENSION), srr_list)) + tasks = [_get_srr(sid, sfn, sem, prefetch_options=prefetch_options) for sid, sfn in zip(srr_list, srr_file_names)] + + try: + return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) + except RuntimeError: + return asyncio.new_event_loop().run_until_complete(asyncio.gather(*tasks)) + + +# TODO: test this +async def _get_srr(srr_id, srr_file_name, semaphore, prefetch_options=PREFETCH_OPTIONS): + """ + Take a SRR ID string and get the SRR file for it from NCBI. 
+ + :param srr_id: str + NCBI SRR ID string + :param srr_file_name: str + The path to the SRR file (the FULL path) + :param semaphore: asyncio.Semaphore + Semaphore for resource utilization + :param prefetch_options: list(str) + Any additional command line arguments to pass to prefetch + :return srr_file_name: str + The SRR file name (including path) + """ + async with semaphore: + # If the file is already downloaded, don't do anything + if os.path.exists(srr_file_name): + print("{id} exists in file {file}".format(id=srr_id, file=srr_file_name)) + return srr_file_name + + prefetch_call = [PREFETCH_EXECUTABLE_PATH, srr_id, "-o", srr_file_name, *prefetch_options] + print(" ".join(prefetch_call)) + process = await asyncio.create_subprocess_exec(*prefetch_call) + code = await process.wait() + + if int(code) != 0: + print("NCBI Prefetch failed for {id} ({file})".format(id=srr_id, file=srr_file_name)) + return None + + return srr_file_name + + +# TODO: test this +def unpack_srr_files(srr_ids, srr_file_names, target_path, num_workers=5): + """ + Take an SRR file and unpack it into a set of FASTQ files + + :param srr_ids: list(str) + NCBI SRR ID string + :param srr_file_names: list(str) + The complete path to the SRR file + :param target_path: str + The path to put the FASTQ file(s) + :param num_workers: int + Number of concurrent jobs to run + :return fastq_file_names: list + A list of complete FASTQ file names that were unpacked from the SRR file (including path) + """ + + sem = asyncio.Semaphore(num_workers) + + tasks = [_unpack_srr(sid, sfn, target_path, sem) for sid, sfn in zip(srr_ids, srr_file_names)] + return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) + + +# TODO: test this +async def _unpack_srr(srr_id, srr_file_name, target_path, semaphore): + """ + + :param srr_id: str + NCBI SRR ID string + :param srr_file_name: str + The complete path to the SRR file + :param target_path: str + The path to put the FASTQ file(s) + :param semaphore: 
asyncio.Semaphore + Semaphore for resource utilization + :return: + """ + async with semaphore: + + if srr_file_name is None: + return [None] + + # Check and see if this has already been done + output_file_names = list(map(lambda x: os.path.join(file_path_abs(target_path), srr_id + x), + POSSIBLE_FASTQ_EXTENSIONS)) + files_created = check_list_of_files_exist(output_file_names) + + # If the file is already unpacked, don't do anything + if len(files_created) > 0: + print("{id} exists in path {path} ({files})".format(id=srr_id, path=target_path, + files=" ".join(files_created))) + return files_created + + # Build a fastq-dump call and execute it + fastq_dump_call = [FASTQDUMP_EXECUTABLE_PATH, "--gzip", "--split-files", "--outdir", target_path, + srr_file_name] + + print(" ".join(fastq_dump_call)) + + # Run fastq-dump and get the files that were created from it + return_code = 0 + try: + process = await asyncio.create_subprocess_exec(*fastq_dump_call) + return_code = await process.wait() + file_output = check_list_of_files_exist(output_file_names) + except: + return_code = 1 + file_output = [None] + raise + finally: + # If the fastq-dump failed, clean up the files associated with it and then move on + if int(return_code) != 0: + print("NCBI fastq-dump failed for {id} ({file})".format(id=srr_id, file=srr_file_name)) + files_created = check_list_of_files_exist(output_file_names) + for f in files_created: + try: + os.remove(f) + except FileNotFoundError: + pass + file_output = [None] + + # Find out which read files were created by looking into the output folder + return file_output + + +def check_list_of_files_exist(file_list): + """ + Check a list of file names and return subset of the list that exists (or an empty list if none exist) + :param file_list: list(str) + List of file names + :return existing_file_list: list(str) + List of files that exist + """ + + existing_file_list = [] + + for file_name in file_list: + if os.path.exists(file_path_abs(file_name)): + 
existing_file_list.append(file_name) + + return existing_file_list diff --git a/inferelator_prior/processor/star.py b/inferelator_prior/processor/star.py new file mode 100644 index 0000000..6b8a9fa --- /dev/null +++ b/inferelator_prior/processor/star.py @@ -0,0 +1,218 @@ +import asyncio +import os +import shutil +import subprocess + +import numpy as np + +from inferelator_prior.processor.utils import get_file_from_url, file_path_abs, get_genome_file_locs +from inferelator_prior import STAR_EXECUTABLE_PATH + +STAR_COUNT_FILE_NAME = "ReadsPerGene.out.tab" +STAR_ALIGNMENT_FILE_NAME = "Aligned.out.sam" +STAR_COUNT_FILE_METAINDEXES = ["N_unmapped", "N_multimapping", "N_noFeature", "N_ambiguous"] +STAR_COUNT_FILE_HEADER = ["Total", "MinusStrand", "PlusStrand"] +STAR_COUNT_COLUMN = "Total" + +STAR_DEFAULT_MKREF_OPTIONS = [] +STAR_DEFAULT_COUNT_OPTIONS = [] + + +# TODO: test this +def star_align_fastqs(srr_ids, fastq_file_names, reference_genome, output_path, num_workers=4, threads_per_worker=5, + star_options=STAR_DEFAULT_COUNT_OPTIONS): + """ + Take a set of FASTQ files and align them with the STAR aligner + + :param srr_ids: list(str) + NCBI SRR ID string + :param fastq_file_names: list(list(str)) + A list of complete FASTQ file names that were unpacked from the SRR file (including path) + :param reference_genome: str + A path to the STAR reference genome that was preassembled + :param output_path: str + The path to put the output alignment files + :param num_workers: int + Number of separate simultaneous jobs to run + :param threads_per_worker: int + Number of threads to assign to each job in STAR (--runThreadN) + :param star_options: list(str) + A list of options to pass to the STAR aligner + :return sam_file_names: list(str) + The SAM alignment files generated by STAR (including path) + """ + + sem = asyncio.Semaphore(num_workers) + + # Build output paths for STAR from SRR ids + output_paths = list(map(lambda x: os.path.join(output_path, x, ''), srr_ids)) + + # Build 
STAR tasks + tasks = [_star_align(sid, fqfn, reference_genome, sout, sem, + threads_per_worker=threads_per_worker, star_options=star_options) + for sid, fqfn, sout in zip(srr_ids, fastq_file_names, output_paths)] + + # Run and return STAR tasks + return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) + + +# TODO: test this +async def _star_align(srr_id, fastq_file_names, reference_genome, output_path, semaphore, + threads_per_worker=5, star_options=STAR_DEFAULT_COUNT_OPTIONS): + """ + Align an individual set of FASTQs from an SRA to the reference genome + :param srr_id: str + NCBI SRR ID string + :param fastq_file_names: list(str) + A list of FASTQ files for the SRR ID + :param reference_genome: str + A path to the STAR reference genome + :param output_path: str + A path to the output + :param semaphore: asyncio.Semaphore + Semaphore for resource utilization + :param threads_per_worker: int + Number of threads to assign to each job in STAR (--runThreadN) + :param star_options: list(str) + A list of options to pass to the STAR aligner + :return output_file: str + The path to the SAM file generated by STAR + """ + async with semaphore: + + if fastq_file_names[0] is None: + return None + + try: + os.makedirs(output_path) + except FileExistsError: + pass + + output_file = os.path.join(file_path_abs(output_path), STAR_ALIGNMENT_FILE_NAME) + + if os.path.exists(output_file): + print("{id} SAM alignment file exists ({path})".format(id=srr_id, path=output_path)) + return output_file + + # Build the STAR executable call + star_call = [STAR_EXECUTABLE_PATH, + "--runThreadN", str(threads_per_worker), + "--runMode", "alignReads", + "--readFilesCommand", "zcat", + "--genomeDir", reference_genome, + "--outFileNamePrefix", os.path.join(file_path_abs(output_path), ''), + "--readFilesIn", *fastq_file_names, + "--outFilterType", "BySJout"] + + # Add in any additional options + star_call.extend(star_options) + + print(" ".join(star_call)) + process = await 
asyncio.create_subprocess_exec(*star_call) + code = await process.wait() + + if int(code) != 0: + print("STAR failed for {id} ({files})".format(id=srr_id, files=" ".join(fastq_file_names))) + return None + + return output_file + + +# TODO: test this +def star_mkref(output_path, genome_file=None, annotation_file=None, default_genome=None, + star_options=STAR_DEFAULT_MKREF_OPTIONS, cores=1, gff_annotations=None, + star_executable=STAR_EXECUTABLE_PATH, move_files=True): + """ + Make a reference genome index for STAR to align reads to + :param output_path: str + Path to output reference index into + :param genome_file: list(str) + Genome sequences (usually FASTA) + :param annotation_file: str + Annotation file (usually GTF or GFF) + :param default_genome: str + A string to identify one of the common genomes + This will cause the genome data to be downloaded from ENSEMBL + :param star_options: list + A list of additional options to pass to STAR + :param cores: int + Number of cores to pass to STAR + :param gff_annotations: bool + Flag for GFF3 (instead of GTF) annotations. If None, it will autodetect .gff files. + :param star_executable: str + Path to the STAR executable + :param move_files: bool + Move the genome/annotation files to a `files` path in the STAR reference genome. If false, just copy. 
+ :return output_path: str + Location where the reference genome has been created + """ + + # Get default genome files from the internet if needed + if (genome_file is None or annotation_file is None) and default_genome is None: + raise ValueError("star_mkref() requires (genome_file AND annotation_file) OR default_genome to be passed") + elif default_genome is not None: + ((genome_url, genome_file), (annotation_url, annotation_file)) = get_genome_file_locs(default_genome) + genome_file = [get_file_from_url(genome_url, genome_file)] + annotation_file = get_file_from_url(annotation_url, annotation_file) + + # Create the output path + output_path = file_path_abs(output_path) + try: + os.makedirs(output_path) + except FileExistsError: + pass + + # Uncompress the genome file if it's gzipped + for i, gf in enumerate(genome_file): + if gf.endswith(".gz"): + subprocess.call(["gunzip", gf]) + genome_file[i] = gf[:-3] + + # Uncompress the annotation file if it's gzipped + if annotation_file.endswith(".gz"): + subprocess.call(["gunzip", annotation_file]) + annotation_file = annotation_file[:-3] + + # Build the STAR executable call + star_call = [star_executable, + "--outFileNamePrefix", os.path.join(file_path_abs(output_path), ''), + "--runThreadN", str(cores), + "--runMode", "genomeGenerate", + "--genomeDir", output_path, + "--genomeFastaFiles", *genome_file, + "--sjdbGTFfile", annotation_file] + + # Add any passed-in options + star_call.extend(star_options) + + # Set a flag for STAR if it's a small genome + # Sum file sizes as a proxy for genome size (approximately correct for ASCII files) + star_sa_idx_size = sum(map(lambda x: os.path.getsize(x), genome_file)) + # Calculate genomeSAindexNbases value with the weird equation from the STAR manual + star_sa_idx_size = int(np.floor(np.log2(star_sa_idx_size) / 2 - 1)) + if star_sa_idx_size < 14: + star_call.extend(["--genomeSAindexNbases", str(star_sa_idx_size)]) + + # Set a flag for STAR if the annotation file is GFF3 + if 
(gff_annotations is None and ".gff" in annotation_file) or gff_annotations: + star_call.extend(["--sjdbGTFtagExonParentTranscript", "Parent"]) + + # Execute STAR + print(" ".join(star_call)) + subprocess.call(star_call) + + output_file_path = os.path.join(output_path, "files") + try: + os.mkdir(output_file_path) + except FileExistsError: + pass + + if move_files: + file_func = os.rename + else: + file_func = shutil.copy2 + + [file_func(file, os.path.join(output_file_path, os.path.basename(file))) for file in genome_file] + file_func(annotation_file, os.path.join(output_file_path, os.path.basename(annotation_file))) + + return output_path diff --git a/inferelator_prior/processor/utils.py b/inferelator_prior/processor/utils.py new file mode 100644 index 0000000..5788757 --- /dev/null +++ b/inferelator_prior/processor/utils.py @@ -0,0 +1,142 @@ +import argparse +import os +import shutil +import subprocess +import sys +import numpy as np + +if sys.version_info[0] < 3: + print("The inferelator_prior package requires python3") + exit(1) + +import urllib.parse +import urllib.request + +from inferelator_prior import STAR_EXECUTABLE_PATH, PREFETCH_EXECUTABLE_PATH, FASTQDUMP_EXECUTABLE_PATH + +# Tuple of ((fasta_url, fasta_file_name), (gff_url, gff_file_name)) + +_HG38 = (( + "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz", + "hg38.fa.gz"), + ( + "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gtf.gz", + "hg38.gtf.gz")) + +_SC64 = ( +("ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz", + "sc64.fa.gz"), +("ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.gtf.gz", + "sc64.gtf.gz")) + +_MM10 = (( + 
"ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz", + "mm10.fa.gz"), + ( + "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.gtf.gz", + "mm10.gtf.gz")) + +# Key by genome name +_DEFAULT_GENOMES = {"hg38": _HG38, "sc64": _SC64, "mm10": _MM10} + +# Requirements tests (produce version for each requirement) +_TEST_REQUIREMENTS = {'prefetch': ("", [PREFETCH_EXECUTABLE_PATH, "--version"]), + 'fastq-dump': ("", [FASTQDUMP_EXECUTABLE_PATH, "--version"]), + 'STAR': ("STAR : ", [STAR_EXECUTABLE_PATH, "--version"])} + + +def get_genome_file_locs(genome): + if genome in _DEFAULT_GENOMES.keys(): + return _DEFAULT_GENOMES[genome] + else: + raise ValueError("Genome must be one of {k}".format(k=" ".join(_DEFAULT_GENOMES.keys()))) + + +def get_file_from_url(file_url, file_name_local=None): + """ + Download a file from a url to a local file + :param file_url: + :param file_name_local: + :return: + """ + + if file_name_local is None: + file_name_local = file_path_abs(urllib.parse.urlsplit(file_url).path.split("/")[-1]) + + print("Downloading {url} to {file}".format(url=file_url, file=file_name_local)) + + with urllib.request.urlopen(file_url) as remote_handle, open(file_name_local, mode="wb") as local_handle: + shutil.copyfileobj(remote_handle, local_handle) + + return file_name_local + + +def file_path_abs(file_path): + """ + Convert a file path to a safe absolute path + :param file_path: str + :return: str + """ + return os.path.abspath(os.path.expanduser(file_path)) + + +def test_requirements_exist(test_targets=_TEST_REQUIREMENTS.keys(), test_package=_TEST_REQUIREMENTS, test_htseq=True, + test_chroma=False): + """ + Test that the requirements to run this package exist. Print versions of what can be found and raise a ValueError if + any required software is missing. 
+ :param test_targets: list(str) + A list of packages to test + :param test_package: dict + A dict, keyed by package names, of the commands to run to test the package versions + :param test_htseq: bool + Test for the python HTSeq package + :param test_chroma: bool + Test for the python ChromA package + :return: + """ + + failed = False + + for req, (pref, cmd) in {k: test_package[k] for k in test_targets}.items(): + try: + proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout = pref + " ".join([l for l in proc.stdout.decode().strip().split("\n") if l.strip() != ""]) + print(stdout) + except subprocess.CalledProcessError: + print("{req} : {req} not found (CalledProcessError)".format(req=req)) + failed = True + + if test_htseq: + try: + import HTSeq + print("HTSeq : " + str(HTSeq.__version__)) + except ImportError: + print("HTSeq : HTSeq not found (ImportError)") + failed = True + + if test_chroma: + try: + import ChromA + print("ChromA : " + str(ChromA.__version__)) + except ImportError: + print("ChromA : ChromA not found (ImportError)") + failed = True + + if failed: + raise FileNotFoundError + + return True + + +# ArgumentParser that tests requirements if it fails to parse arguments +# I just want to run a script with no arguments and have it test dependencies +class ArgParseTestRequirements(argparse.ArgumentParser): + + def error(self, message): + try: + test_requirements_exist() + except FileNotFoundError: + pass + finally: + super(ArgParseTestRequirements, self).error(message) diff --git a/inferelator_prior/pwm_to_meme.py b/inferelator_prior/pwm_to_meme.py new file mode 100644 index 0000000..9779ad8 --- /dev/null +++ b/inferelator_prior/pwm_to_meme.py @@ -0,0 +1,34 @@ +from inferelator_prior.motifs.pwm import read +from inferelator_prior.motifs.meme import write + +import argparse +import glob +import os + + +def main(): + ap = argparse.ArgumentParser(description="Parse naked PWM files into a MEME file") + ap.add_argument("-m", 
"--motif", dest="motif", help="Motif PWM files", metavar="PATH", required=True, nargs="+") + ap.add_argument("-i", "--info", dest="info", help="Motif Info File", metavar="PATH", required=True) + ap.add_argument("-o", "--out", dest="out", help="Output FILE", metavar="FILE", required=True) + + args = ap.parse_args() + + files = [] + for mf in args.motif: + files.extend(glob.glob(os.path.expanduser(mf))) + + pwm_to_meme(files, args.info, args.out) + + +def pwm_to_meme(pwm_file_list, tf_info_file, output_file): + + print("Parsing {x} PWM files".format(x=len(pwm_file_list))) + motifs = read(pwm_file_list, tf_info_file, direct_only=True) + + print("Parsed {m} motifs, writing to file {f}".format(m=len(motifs), f=output_file)) + write(output_file, list(motifs)) + + +if __name__ == '__main__': + main() diff --git a/inferelator_prior/tests/__init__.py b/inferelator_prior/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inferelator_prior/tests/artifacts/M00799_2.00.txt b/inferelator_prior/tests/artifacts/M00799_2.00.txt new file mode 100644 index 0000000..41ecdd1 --- /dev/null +++ b/inferelator_prior/tests/artifacts/M00799_2.00.txt @@ -0,0 +1,10 @@ +Pos A C G T +1 0.248650039776609 0.26139859992769 0.241301320519092 0.248650039776609 +2 0.392226269785661 0.219606847798542 0.0702344472606129 0.317932435155184 +3 0.0257615986027584 0.0113462234969035 0.942829024734492 0.0200631531658465 +4 0.931183694119653 0.0181098604626899 0.00917476304082202 0.0415316823768348 +5 0.0209101275685474 0.0204960793014344 0.00500398009361691 0.953589813036401 +6 0.652610298711976 0.0667517267238459 0.0282467996828702 0.252391174881308 +7 0.541666052707409 0.110459581183674 0.158324966820951 0.189549399287965 +8 0.196060485729292 0.286696226860535 0.350171562229744 0.167071725180429 +9 0.286129811839987 0.225077681884186 0.309960724838339 0.178831781437488 diff --git a/inferelator_prior/tests/artifacts/sc64_mito.fasta b/inferelator_prior/tests/artifacts/sc64_mito.fasta 
new file mode 100644 index 0000000..2b88a72 --- /dev/null +++ b/inferelator_prior/tests/artifacts/sc64_mito.fasta @@ -0,0 +1,1431 @@ +>Mito dna:chromosome chromosome:R64-1-1:Mito:1:85779:1 REF +TTCATAATTAATTTTTTATATATATATTATATTATAATATTAATTTATATTATAAAAATA +ATATTTATTATTAAAATATTTATTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCC +GGAATTATTAATTAATAATAAATTATTATTAATAATTATTTATTATTTTATCATTAAAAT +ATATAAATAAAAAATATTAAAAAGATAAAAAAAATAATGTTTATTCTTTATATAAATTAT +ATATATATATATAATTAATTAATTAATTAATTAATTAATAATAAAAATATAATTATAAAT +AATATAAATATTATTCTTTATTAATAAATATATATTTATATATTATAAAAGTATCTTAAT +TAATAAAAATAAACATTTAATAATATGAATTATATATTATTATTATTATTAATAAAATTA +TTAATAATAATCAATATGAAATTAATAAAAATCTTATAAAAAAGTAATGAATACTCCTTT +TTAAAAATAAAAAGGGGTTCGGTCCCCCCCCTTCCGTATACTTACGGGAGGGGGGTCCCT +CACTCCTTCTTAATTAAATTATCTTAATTAAATTATCTTAATTAAATTATCTTAATTAAA +TTATCTTAATTAAATTATCTTAATTAAATTAAAAGGGGACTTTATATTTATAAAGTAATT +ATATTATTATTATTATTATTATTTATTTATTTTATTTTTATTATTTTATTATATATATTA +TATATTAATACAGATAGAAGCCAAAAGGTCAGGCGCTTTCTTTGGGAGAAAGACCTAGTT +AGTTCGAGTCTATCCTATCTGATAATAATTTAATTAACCATTAAAAAAAAGTATATATAT +TTATCATAATATATTAAATTTTATTACATTACAAATGAACACTTTTATTTATATTTATAA +AAATATGAACTCCTTCGGGGTCCGCCCCGCGGGGGCGGGCCGGACTCCATATTATTATTA +TTATAATTATTATTATAATTATTATTATAATTATTATTATAATTATTATTATAATTAAAG +AGTTTTGGATACCAATATGATATAATATGATATAGGACCGAAACCCCTCATTTTATCATT +TATTTATAATATTATAAATAAAAAAAAATATTATATATTATAATAAAATTAATATCATAA +TATATTATATTATATATTATATTATATATATATATATATATATTCTTTTATAAAATTTAT +ATTCTTCTTATTAAAATTAAAAAGGGAGCGGACTTTTAATTATATTTAATTATAGTTTTT +AATCATTGGTTGAGATTTCAAAATAAGGTATAATATTTATATTATTCTTTAACAAATATT +ATATTATAAAAAAAGATATAATATTTATATTATTCTTTAACAAATATTATATTATAAAAA +AGATATAATATTTATATATTATTATTAATATTATTTTTAAGTTCCGAAAGGAGAAACTTA +TAATTTTTATATCATTATTTATTATTATTTTTAATTTCAACTCCTTTTAGGTATTTCCAT +TTAACTTTCAGCAGAGACTTTCTAATTATAATTATATATATATAAATTTAAATACATTTA +TAAAAAAGTATATAATATAATTATATTATATATAATAATATTATTAAATGAAGTATTCTT +TATTATTAATTATAGGATATCTGGGGTCCATTAATAATTATTATTGTAAATAATAATAAG +GACCCCCCCCATTATCTAATTAATAAATATATAAATAATCATTAATAAATATATTAATAA 
+TTATTAATAAATATATAAATAATCATTAATAAATATATAAATAATATAATATATTATAAA +AATATAATAATAATAATTTATTATTAAAATATAATAATTTATTATAAAAATATAATAATT +TATTATAAAAATATAATAATAACTCCTTTCGGGGTTCACACCTTTATAAATAATAAATAA +TAAATAATAAATAATAAATAATAAATATTAGTATTCACTAATATAAAATAATAATTATAA +AAATAATCATTATTAAAAATATTATTAATTATTAAATTAAATACAATTAATATAATTTAG +TTGTTTATATAATTTTAAATAATGTTTATATCAATTTAATAAAATTAAATTTATAGTTCC +GGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTTTATCTATATATTATAATAACTA +TATGAATTTAATTATTAAAAATAATAAAAATAAGGAATTTTAATAAGAAGTAATATTTAT +TATATAATATATAAAAAAAATATATATATATATATAAAAATATATATAATAAGTTTTATT +ATAATATATATTAAATTAATTATTATGAGGGGTTCGGTCCCTTTCCGGGCCCCAATTCAT +CTCATCTCATTTTATTTCATTTCAATATCATCTAATCTCATTTCTTTATAGATTTTACAT +ATATATAAATATAAATATAAGATATTCACATTTATATATAATATAATATAATATAATAGA +TATTCATTCCTCTTTGATTAAACTAATAATTAATAATTAATAATTAATAATTAATAATTA +ATAATTATTCAGTAGAACTCCTTCTTAAAAAGGGGTTCGGTCCCCCTCCCATTAGTATAG +TATAGGGAGGGGTCCCTCACTCCTTCGGGGTCCGCCCCGCAGGGGGCGGGCCGGACTATT +ATTAAATAATTTATAATTTATTATTTATTAATATATTTATATAATATAATATAATATAAT +ATTATTCATACTTTTTATTAATATAATATAATATAATATTATTAATACTTTCTCCTTTCG +GGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTAATATAAAGAAAAGAGTTTCAATT +ATTTATTTATTTATTTATTTTTTATAAAAATAAGTCCCCGCCCCGGCGGGGACCCCGAAG +GAGTATTAATTTAAATAATTTATTTAATGAAATTATTAATTATAAATAAAAATAATAATT +TTTAAAGATGTAATATAAAAATAAATATAATATAATTTAGGATAATTATATAAAATATTT +ATTATATATAGTTTTTATAAAGAGTTTTAAAAGTGATAATATAATATATAATATTTATAA +GTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTTATTTATATATATATAAT +TATAATCTTATTAATTATTTATATATATATTTAATATTATTTTTATATAATTTTATATTA +AAGTATTATAATTATATATTTAATATTATTTTTATATAATTTTATATTATTTATTTATTT +ATTTATTTATTTAAAAATATTATAATCATATATTTAATATTATTTAATATATTTTATATA +TTATATCTTTTATTGATTTATATATATATAGATTTAATAAATATATATATATATATATAT +ATAAATATTCATTATATATTTATTATTATTATTATTATTTATTACTATTTTTTATTATAT +ATTAATAATATATATATTATTAGTTATGGGTATCCTAATAGTATATTATTATTTTTAATA +ATAATTTATGATTTATGTATAATAAATAAGTAGGGAATCGGTACGAATATCGAAAGGAGT +TATATATTATTAATTATTTATAATTATTTTATATATTATTAATTATTTATAATTATTTTA +TATATTTATAATTATTTTATATAGATAGGTTAGATAGGATAGATAGTATAGATAGGGGTC 
+CCATTTATTATTTACAATAATAATTATTAATGGGACCCGGATATCTTATTGTTATTAATT +TATATATTATTCATTATTATTAATATATATTTAATATAATTAAATATTATATTATATTAT +ATTATATTATTTATTAAAAAAAAATCTATTACTTATTTTTTTTATTAATATATAAATTAT +TTATATAATTTATCATTTTTATTTATATATTATTATTTTTTATATATAAATTAATATATA +TATATATTATATATACTTTTTTTTTTATAATATATCTATATATATAAATAAATATATTAT +ATTATATTTTTATATAATATATTATTAATTATTATTTTAATTTTCTATTCTATTGTGGGG +GTCCCAATTATTATTTTCAATAATAATTATTATTGGGACCCGGATATCTTCTTGTTTATC +ATTTATTATTTTATTAAATTTATTATTATTTTTAATTTATATTTATATTATATAATTAAT +TATATCGTTTATACTCCTTCGGGGTCCCCGCCGGGGCGGGGACTTTATATTTTATTATAT +AATATATTATATTCTTATAATATATTTATTGATTATGTTATAAAATTTATTCTATGTGTG +CTCTATATATATTTAATATTCTGGTTATTATCACCCACCCCCTCCCCCTATTACGTCTCC +GAGGTCCCGGTTTCGTAAGAAACCGGGACTTATATATTTATAAATATAAATCTAACTTAA +TTAATAATTTAAATAATATACTTTATATTTTATAAATAAAAATAATTATAACCTTTTTTA +TAATTATATATAATAATAATATATATTATCAAATAATTATTATTTCTTTTTTTTCTTTAA +TTAATTAATTAATTAATATTTTATAAAAATATATTTCTCCTTACGGGGTTCCGGCTCCCG +TAGCCGGGGCCCGAAACTAAATAAAATATATTATTAATAATATTATATAATATAATAATA +ATATAATAATTTTATATAAATATATATTTATATATTAAATTAAATTATAATTTTATTATG +AAAATTATATCTTTTTTTTATATTTTTATATAATAAAAATATGTTATATATATATTAATA +ATAAAAGGTAGTGAGGATTAAATAAATTATATAATAATTATAACTCTTAATTATAAAATA +AATATATATATATATATAAGTATCCATTTCCATATAATCTTTTAATAAATATTAATAAAT +ATTAAAAAAAAATAATATTATAATATTTTAGTATATAATTCAATAAAATTCATTGGAGGG +GTAAATAATAATAATTTACTAATGGCAAGTTATAGTCTTAAAGGTTTTTATTTTTTTTAT +TAAATTAATAAAATAATAATACCATTTATATATTCCATTATATATATATATTTAATAAAA +ATAATAATATCATTTATATATTTTATTATATATTATATATATTTTATATAAAATAATAAT +AATAAATTTATATTTTTATATATTATTATTAAATAATAATAATATAAATAACTCCTTCGG +GGTTCGGTCCCCACGGGTCCCTCACTCCTTCTTAAGAATAAAAAGGGGTTCGGTCCCCCT +CCCGTTAGTACACGGGAGGGGGTCTCTCACTCCTTCTTAAAAAATAAAAAGGTGGAAGGA +CTAATATAATTTTAAATAATAATTAATACTTTAATAATAATTTGTATTTCTTTATTATTA +ATATATTAAATATAATAATAATTAATATAATTACAATATATTAATATTATCAAATATTAA +TAAATATACTTTTTTATATAATTTATTTATTTATTTATTTTTTTTTTATTAAACTAATTA +TAATTGTAATTTCGAAAAGGGGGTGGGAGTAAACATATATAATTTATAATCTATATATAT +ATATATATAATTTTTTAATAAATATTAATAAATATTTATAAAAAGAATAATTTATATTTA 
+TAATATATAATTTATATATTTTATTTTTATTATACAATTAATATAAAATATAAAATATTA +AATATTAAATATTAAATATTAAATATTAAATATTAATTTTTATAGGGGTTATATAATAAT +TATATTTATAATTATATAATATTAAAAAGGGTATTTTTATAATTATTACATTTTTATTTT +ATTTATAAAAATATTAATTTTAATAAGTATTGAATACTTTATATAATATAAATATTAATT +ACATAATTAATAATTAAATAATATTTAATAATATTATTTAAATTTATTATTTATAATTAT +TTATTTATAAAATTCTATTTTTATTATTATTATTTTTATTTTATTATTAAAGATTAATAT +AATAATTATTAATATATTAAAAATCTTTTATTATATTAATATTTATAAAAAAGTATTTAA +TAAAAAAGATGTATAAATTTATAAATTATATAATATTATTAATTTATATAATAATAATAT +TATAACTTTGTGATTGTCAATTTAGTTAATCATTGTTATTAATAAAGGAAAGATATAAAA +AATATTCTCCTTCTTAAAAAGGGGTTCGGTTCCCCCCCGTAAGGGGGGGGTCCCTCACTC +CTTTGGTCGGACTCCTTCGGGGTCCGCCCCGCGGGGGCGGGCCGGACTAATTTAACTTTT +AATATTAATATTAATATTATTTATATTTTTAATATATAAAAATAAATAATTTTATTTTTA +TTAATAGTATATTATATAAACAATAAAATAGTATTAATTATATAAAATTTATATAAAATA +TATATAAATTTATTATATATATATATATTAATATTTTAATAAAGTTTTTATTATAAATTT +ATTTATTTATTTATTATAATATTAATAATTTATTTATTATTATATAAGTAATAAATAATA +GTTTTATATAATAATAATAATATATATATATATATATTATTATATTAGTTATATAATAAG +GAAAAGTAAAAAATTTATAAGAATATGATGTTGGTTCAGATTAAGCGCTAAATAAGGACA +TGACACATGCGAATCATACGTTTATTATTGATAAGATAATAAATATGTGGTGTAAACGTG +AGTAATTTTATTAGGAATTAATGAACTATAGAATAAGCTAAATACTTAATATATTATTAT +ATAAAAATAATTTATATAATAAAAAGGATATATATATAATATATATTTATCTATAGTCAA +GCCAATAATGGTTTAGGTAGTAGGTTTATTAAGAGTTAAACCTAGCCAACGATCCATAAT +CGATAATGAAAGTTAGAACGATCACGTTGACTCTGAAATATAGTCAATATCTATAAGATA +CAGCAGTGAGGAATATTGGACAATGATCGAAAGATTGATCCAGTTACTTATTAGGATGAT +ATATAAAAATATTTTATTTTATTTATAAATATTAAATATTTATAATAATAATAATAATAA +TATATATATATAAATTGATTAAAAATAAAATCCATAAATAATTAAAATAATGATATTAAT +TACCATATATATTTTTATATGGATATATATATTAATAATAATATTAATTTTATTATTATT +AATAATATATTTTAATAGTCCTGACTAATATTTGTGCCAGCAGTCGCGGTAACACAAAGA +GGGCGAGCGTTAATCATAATGGTTTAAAGGATCCGTAGAATGAATTATATATTATAATTT +AGAGTTAATAAAATATAATTAAAGAATTATAATAGTAAAGATGAAATAATAATAATAATT +ATAAGACTAATATATGTGAAAATATTAATTAAATATTAACTGACATTGAGGGATTAAAAC +TAGAGTAGCGAAACGGATTCGATACCCGTGTAGTTCTAGTAGTAAACTATGAATACAATT +ATTTATAATATATATTATATATAAATAATAAATGAAAATGAAAGTATTCCACCTGAAGAG 
+TACGTTAGCAATAATGAAACTCAAAACAATAGACGGTTACAGACTTAAGCAGTGGAGCAT +GTTATTTAATTCGATAATCCACGACTAACCTTACCATATTTTGAATATTATAATAATTAT +TATAATTATTATATTACAGGCGTTACATTGTTGTCTTTAGTTCGTGCTGCAAAGTTTTAG +ATTAAGTTCATAAACGAACAAAACTCCATATATATAATTTTAATTATATATAATTTTATA +TTATTTATTAATATAAAGAAAGGAATTAAGACAAATCATAATGATCCTTATAATATGGGT +AATAGACGTGCTATAATAAAATGATAATAAAATTATATAAAATATATTTAATTATATTTA +ATTAATAATATAAAACATTTTAATTTTTAATATATTTTTTTATTATATATTAATATGAAT +TATAATCTGAAATTCGATTATATGAAAAAAGAATTGCTAGTAATACGTAAATTAGTATGT +TACGGTGAATATTCTAACTGTTTCGCACTAATCACTCATCACGCGTTGAAACATATTATT +ATCTTATTATTTATATAATATTTTTTAATAAATATTAATAATTATTAATTTATATTTATT +TATATCAGAAATAATATGAATTAATGCGAAGTTGAAATACAGTTACCGTAGGGGAACCTG +CGGTGGGCTTATAAATATCTTAAATATTCTTACATAAATATTAATCTAAATATTAATATA +AATATTAATATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAAA +TATTAATATAAATATAAATATTAATATAAATATAAATATAAATATAAATATATTTTAATA +TAATATAATATAATATATAATATATTATATAAATATAATATATAAATAATATAATAAAAT +ATTTTAATATATATATAATATAATATAATTATTATTATAATTTAATATAAATTATTATTA +TAATTTAATATAATAAATAAATAAATAATTATAATTATAATTATAATTATAATCTCAATA +TATAAATGATAAATTATTATAAATACAAAGGAAATAATTGATTTTTAAAATATATTTAAT +AAAATATATAATATAAATTATACTTTTTTTGTTATTATATAATAATTATATTAATATATT +TAATAGAATTAAACTCCTTCGGCCGGACTATTATTCATTTTATATATTAATGATAAATCA +TTAATTATTATTAATAAATTTATTTATAATATTTAATTTTATATATTATTATTTATAATA +AAAAAAATTATATTATAACAATTTAATTTTAATTTTTATTTTTAAATTATAAAATTAATA +ATTTATTTGTTTAAATAAAATTTATAACTCCTTCGGGGTTCGGCCGGACTATTAATATAA +ATAAATAATAAATATTTATAATAAAATAATATACATCTTCTTTAAATAAAAAAAGGGGAC +ATTATAAATAGTATATAAATATATTATATCTTTTTTATTATTATTATTAATAAATAATAA +TAATAATTTATATATTTATAATATATTTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGA +ACCCCGAAAGGAGAATGTATTATAATTATTACATATAATTATTATTATTCACTTCTTATT +AAAAATAATACTCTATATAATTTATATAATTTATTTTAATATATATATATTTATATATAA +TATAATATATATATTTATTTATTATAATCATTTTTTTTTAACTTAAAATAAAACTTATTA +TAATTTATATAATTTATAATTTTTATATAAAAATAATTATATAATTTTTATTTATTTATA +TAATAATAATATTATTTGTTATATATTATATATTATATATATAATAAATAAATAAATAAT +AAATAATAATAATAAGGATATAGTTTAATGGTAAAACAGTTGATTTCAAATCAATCATTA 
+GGAGTTCGAATCTCTTTATCCTTGATAATAATAATAAAAATATGTATTTATTTAATTATT +TTAATATTTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTAATATA +ATATAATATAATATAAATATTCATTTATCTTTTTTTTAATATTCTTAATTAATTAATTAA +TTAATATATTAATTATAAAAAATATATTATAATTTTATTATTAATAAGTATAAATATATT +ATTAATAATAATTTATTAAAAATATATTATTATAATATATTAATATATCATAATTATAAT +CAATATTATATTATTTAATTTTATAATACTTAATTATTAATATATTATTCATATATATAT +AAATTAAATTAAATTAATTATATTGAATATATAAATATATATATATATAAATATATAAAA +AATTATATAAATTATTTTAAGTAAAAATAATATTAATAAAAATTATACAATAATAATAAT +AAATATTCATTATTATTTAATTAATATCTCCTTTACTTCTTTTTCCTCCGTTGAGGACTT +ATTATTAAGTATATTATTATATACTACTTAAGATTATATATATAATATATATATATATAT +TATATATAAAATATAAATATATAAATAATATAAAAATTAATAAAATAAATAAAATAAATT +AGTCCGATCGAATCCCCTATTTAATTAAATTAAATTAAATTAAGAAAGAGATAAATTTAT +ATAAAATATTATTTATAATTAATTATAATTAAATTATAATATAATATAATATAAATAATA +ATATAATAAAAATAAAAATAAAATAATATTAGATTATATTATATAATTTATATAATTTTT +TAATAATAATAATAAATAAGTTTATTTATAATTATAAATATAAATATAAATATAAATAAA +GAAGGTATTATATTTTATAAAATATAATAATAATACAAAATTTATATTTTAATAAATATT +AATATAAGTTTAAAGTTCCGGGGCCCGGCACGGGAGCCGGAACCCCGAAAGGAGAAATAA +ATAATATATTTATAAAAAATTAAATAAATAAATATTATCTATTTAAAAATAAATATAATA +TAATATAATATAATAATTCTAAATATAAATAATATTTATTATAATTATTATAATAATTGT +ATTATTTATTAATAATATATATAATTATATTAAAACTAATATTACATTATTTTGTATATT +TAAACAATTAAATTGATTATTCTTATTTGTAATCTTTATTTATTTTATTATATCTTATTA +ATGATAAATTATAATTATTATTAAAATAATAATTTACTTCTTTTGATATAAAAATAAAAT +AATATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGATAAATATATTAT +ATTTTTATTCCTACCTATTAAAGGTAAAGACTCGATTCTCATAATTAAATTTATATCCTT +CGGCCGGATTAATTTATTTTATTTATATTTATATTTATAGTGAATACCTTTTTTAATATT +TATTTTTAATATTTATTTTTAATATTTTATTTTTAATAAAATATAATCTTGTAAGTAAGA +AAAGAATTTCGGTGATTGGAACCTTGAAAGGATAAATTTCTTATTTATTATAATATTTAT +ATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTATTATTAAACA +TTTAATATATTATATTAATATTTAATTTAAATGATTAATATATTATTATAATAATATTTA +TTTTATATTAAAATATTATAATTAATATATATATATTTATTTTAATAATATTATTATTAT +TATTATTAAAATTATTATTTTTATAAATATATATATATATATATATATATTATTTTTATT +CTTATATAAATTATATAAAAAAAATATATATAATATATAATTAATTAATATATATTATTT 
+AAATTATATATTATTTAAAATACTTTTTATATTATATCTTCTTTAAATTAAAATATAATT +ATTATTTATATTATAATTATTTATGAAATATTATTATTAAAATAAAAAAGAGGTTTAGAC +TATATATTTATTATTTATAAACTTATTATATTATTTATTATTAATAGTTCCGGGGCCCGG +CCACGGGAGCCGGAACCCCGAAAGGAGAAATAAATAAAATAAAAAATAATAAATATTAAT +ATTATTAAATATTATTTATAATAAATATTAATATTATTAAATATTATTCATATTAATAAA +TTTTATTATTATTTGTAATATATTAAATATTAATAATATATATATTATTTATTATAATGA +AAACCTATCCTATATTATCCTATCATATAATATCATATCATATTATATTATATCTTATTA +TATGATATATAAAGTATTCACTCTATATGAGGTTATGATTATTATATAAATCTTATTTTA +TTTTTATTTTTATTTGGACTAATAATAATTATAATAATAATTATTGATATGTTCTAATAT +TAATAAATACATATTTATATTATAATATAAATATTCATTTCTTACTAATTAATAAAAAGT +TTTTATATTCATTATAATATAAATATATAAATATATATAAATATTTTAATAATTATAATT +ATATTAAGATATTATAAATATATATTTATTTTTTTTTATAAAATAAATAAATAAATAAAT +AATTAATATTTTTATATTATAACTTATTTTTATAATAATAATAAGTATTTTATTTTTTAT +TATATTATTATTTATATAATTATATATATATTAATTTCAATTTAATTAATTAATTAATTG +GTATTTGGCATATAATATCAATTAATTGTAATTCTTATAAGAATTAATTAATTAATATGC +TTTTTATATAATTTATACTTTTATATTTCTCCTTCCGGGGTTCCGGCTCCCGTGGCCGGG +CCCCGGAACTATTATTATTATTTTTATTTATTTATTATTAAAATATAATAATAAATAGTC +CGGCCCGCCCCGCGGGGCGGACGCCGGAGGAGAATTATATTTTTATATAATAATTTATAT +TTCTATATATATATATATATATTATATATAAATATTATTATATATATTTTTATATATATT +ATAATTATATTCATTAATATTTTATTATAGTGGTGGGGTCCCAATTATTATTTTCAATAA +TAATTTATCATGGGACCCGGATATCTTCTTGTTTTTATTTATTATTTTATTAAATTTATT +TTAATTATTTATTTATAATTTATATTATACAATTTATTATTTCGTTAATACCTTTATTTA +TATTATATAATATATTATATTATTATAATATATTTATTGATTATATTAATACATTTAACT +AATGTGTGCTCTATATTTATTGAATAGTTTGGTTCTTATCACCCACCCCCTCCCCCTATT +ACGTCTCCGAGGTCCCGGTTTCGTAAGAAACCGGGACTTATATATTTAATACTAAAAATA +TAACTACATTACTTTTTTAATATATATAACAATATATATATATATATATATTAATTATAT +AAAATATAATACTCTATATTAAATATTATTTTTATCAATATTTATTTATATATATAATAA +TAATAATAATAATCAATATTAATTATTTATATATATAAGATTAATATTATTTAATATATT +ATGAATAATTTAATTAATAAATCTTTAAATATTATCATAAAAATATAAATTAAATAATTT +CTTATTTATAATAAAGAATAATAATATATATAAATATAATAAAGAATGTAAATAATATAT +ATATAATATAATATAATATAAAAAATATATATATATATAAATATATATATAATATATAGA +TAATAATATTTTTATATAATTTATTTTATTATTAAGTAATAAATAATAAAAAAATCAATA 
+TATTAAATAATATATTTATATTAGTTCGGTTTAGTTGGTATTTTGTAATGAGTAAAAAGT +AATATATAATATTAAATAATAAGTATTGATATAAGTAATAGATATAATAATAATATTATT +AATATTTTATATAAATAATATTAATAATATAGATTATGAAAGAGAGTATTAATATCATTA +AATATATATATATGTTATATAATTTAAATGATTTTAATATATATATATATATTATATTAT +AGATTATGATACATTTATATAAATAATATATATATAAAAATTAATTATACTATTACTTTA +TAATATAATAATATTTATTTATAAAGATATAAAAGAATTGTTTAAAGTTATAACTAAAAT +ATTATATAGTATTCATTAATAATTAATATTATAAATTCAACTATTGTTATATTTATAAAT +AGAATAATATATTATTATCCTTTAAGATATAACAATAATTATTTAAATTAAATTAAATTA +AATTTAATTAATTTTTTTTTTTAATGAATATAATAATAATAATATTATTAAAATTAATAT +ATAAAAAAAAAGTAAAAATGGTACAAAGATGATTATATTCAACAAATGCAAAAGATATTG +CAGTATTATATTTTATGTTAGCTATTTTTAGTGGTATGGCAGGAACAGCAATGTCTTTAA +TCATTAGATTAGAATTAGCTGCACCTGGTTCACAATATTTACATGGTAATTCACAATTAT +TTAATGGTGCGCCTCTCAGTGCGTATATTTCGTTGATGCGTCTAGCATTAGTATTATGAA +TCATCAATAGATACTTAAAACATATGACTAACTCAGTAGGGGCTAACTTTACGGGGACAA +TAGCATGTCATAAAACACCTATGATTAGTGTAGGTGGAGTTAAGTGTTACATGGTTAGGT +TAACGAACTTCTTACAAGTCTTTATCAGGATTACAATTTCCTCTTATCATTTGGATATAG +TAAAACAAGTTTGATTATTTTACGTTGAGGTAATCAGATTATGATTCATTGTTTTAGATA +GCACAGGCAGTGTGAAAAAGATGAAGGACCTAAATAACACAAAAGGAAATACGAAAAGTG +AGGGATCAACTGAAAGAGGAAACTCTGGAGTTGACAGAGGTATAGTAGTACCGAATACTC +AAATAAAAATGAGATTTTTAAATCAAGTTAGATACTATTCAGTAAATAATAATTTAAAAA +TAGGGAAGGATACCAATATTGAGTTATCAAAAGATACAAGTACTTCGGACTTGTTAGAAT +TTGAGAAATTAGTAATAGATAATATAAATGAGGAAAATATAAATAATAATTTATTAAGTA +TTATAAAAAACGTAGATATATTAATATTAGCATATAATAGAATTAAGAGTAAACCTGGTA +ATATAACTCCAGGTACAACATTAGAAACATTAGATGGTATAAATATAATATATTTAAATA +AATTATCAAATGAATTAGGAACAGGTAAATTCAAATTTAAACCCATGAGAATAGTTAATA +TTCCTAAACCTAAAGGTGGTATAAGACCTTTAAGTGTAGGTAATCCAAGAGATAAAATTG +TACAAGAAGTTATAAGAATAATTTTAGATACAATTTTTGATAAAAAGATATCAACACATT +CACATGGTTTTAGAAAGAATATAAGTTGTCAAACAGCAATTTGAGAAGTTAGAAATATAT +TTGGTGGAAGTAATTGATTTATTGAAGTAGACTTAAAAAAATGTTTTGATACAATTTCTC +ATGATTTAATTATTAAAGAATTAAAAAGATATATTTCAGATAAAGGTTTTATTGATTTAG +TATATAAATTATTAAGAGCTGGTTATATTGATGAGAAAGGAACTTATCATAAACCTATAT +TAGGTTTACCTCAAGGATCATTAATTAGTCCTATCTTATGTAATATTGTAATAACATTGG 
+TAGATAATTGATTAGAAGATTATATTAATTTATATAATAAAGGTAAAGTTAAAAAACAAC +ATCCTACATATAAAAAATTATCAAGAATAATTGCAAAAGCTAAAATATTTTCGACAAGAT +TAAAATTACATAAAGAAAGAGCTAAAGGCCCACTATTTATTTATAATGATCCTAATTTCA +AGAGAATAAAATACGTTAGATATGCAGATGATATTTTAATTGGGGTATTAGGTTCAAAAA +ATGATTGTAAAATAATCAAAAGAGATTTAAACAATTTTTTAAATTCATTAGGTTTAACTA +TAAATGAAGAAAAAACTTTAATTACTTGTGCAACTGAACTACCAGCAAGATTTTTAGGTT +ATAATATTTCAATTACACCTTTAAAAAGAATACCTACAGTTACTAAACTAATTAGAGGTA +AACTTATTAGAAGTAGAAATACAACTAGACCTATTATTAATGCACCAATTAGAGATATTA +TCAATAAATTAGCTACTAATGGATATTGTAAGCATAATAAAAATGGTAGAATAGGAGTGC +CTACAAGAGTAGGTAGATGACTATATGAAGAACCTAGAACAATTATTAATAATTATAAAG +CGTTAGGTAGAGGTATCTTAAATTATTATAAATTAGCTACTAATTATAAAAGATTAAGAG +AAAGAATCTATTACGTATTATATTATTCATGTGTATTAACTTTAGCTAGTAAATATAGAT +TAAAAACAATAAGTAAAACTATTAAAAAATTTGGTTATAATTTAAATATTATTGAAAATG +ATAAATTAATTGCCAATTTTCCAAGAAATACTTTTGATAATATCAAAAAAATTGAAAATC +ATGGTATATTTATATATATATCAGAAGCTAAAGTAACTGATCCTTTTGAATATATCGATT +CAATTAAATATATATTACCTACAGCTAAAGCTAATTTTAATAAACCTTGTAGTATTTGTA +ATTCAACTATTGATGTAGAAATACATCATGTTAAACAATTACATAGAGGTATATTAAAAG +CACTTAAAGATTATATTCTAGGTAGAATAATTACCATAAACAGAAAACAAATTCCATTAT +GTAAACAATGTCATATTAAAACACATAAAAATAAATTTAAAAATATAGGACCTGGTATAT +AAAATCTATTATTAATGATACTCAATATGGAAAGCCGTATGATGGGAAACTATCACGTAC +GGTTTGGGAAAGGCTCTTTAACACGTGGCAACATAGGTTAATTTGCTATTTCATTTTTAG +TAGTTGGTCATGCTGTATTAATGATTTTCTGTGCGCCGTTTCGCTTAATTTATCACTGTA +TTGAAGTGTTAATTGATAAACATATCTCTGTTTATTCAATTAATGAAAACTTTACCGTAT +CATTTTGGTTCTGATTATTAGTAGTAACATACATAGTATTTAGATACGTAAACCATATGG +CTTACCCAGTTGGGGCCAACTCAACGGGGACAATAGCATGCCATAAAAGCGCTGGAGTAA +AACAGCCAGCGCAAGGTAAGAACTGTCCGATGGCTAGGTTAACGAATTCCTGTAAAGAAT +GTTTAGGGTTCTCATTAACTCCTTCCCACTTGGGGATTGTGATTCATGCTTATGTATTGG +AAGAAGAGGTACACGAGTTAACCAAAAATGAATCATTAGCTTTAAGTAAAAGTTGACATT +TGGAGGGCTGTACGAGTTCAAATGGAAAATTAAGAAATACGGGATTGTCCGAAAGGGGAA +ACCCTGGGGATAACGGAGTCTTCATAGTACCCAAATTTAATTTAAATAAAGTGAGATACT +TTAGTACTTTATCTAAATTAAATGCAAGGAAGGAAGACAGTTTAGCGTATTTAACAAAGA +TTAATACTACGGATTTTTCCGAGTTAAATAAATTAATAGAAAATAATCATAATAAACTTG 
+AAACCATTAATACTAGAATTTTAAAATTAATGTCAGATATTAGAATGTTATTAATTGCTT +ATAATAAAATTAAAAGTAAGAAAGGTAATATATCTAAAGGTTCTAATAATATTACCTTAG +ATGGGATTAATATTTCATATTTAAATAAATTATCTAAAGATATTAACACTAATATGTTTA +AATTTTCTCCGGTTAGAAGAGTTGAAATTCCTAAAACATCTGGAGGATTTAGACCTTTAA +GTGTTGGAAATCCTAGAGAAAAAATTGTACAAGAAAGTATGAGAATAATATTAGAAATTA +TCTATAATAATAGTTTCTCTTATTATTCTCATGGATTTAGACCTAACTTATCTTGTTTAA +CAGCTATTATTCAATGTAAAAATTATATGCAATACTGTAATTGATTTATTAAAGTAGATT +TAAATAAATGCTTTGATACAATTCCACATAATATGTTAATTAATGTATTAAATGAGAGAA +TCAAAGATAAAGGTTTCATAGACTTATTATATAAATTATTAAGAGCTGGATATGTTGATA +AAAATAATAATTATCATAATACAACTTTAGGAATTCCTCAAGGTAGTGTTGTCAGTCCTA +TTTTATGTAATATTTTTTTAGATAAATTAGATAAATATTTAGAAAATAAATTTGAGAATG +AATTCAATACTGGAAATATGTCTAATAGAGGTAGAAATCCAATTTATAATAGTTTATCAT +CTAAAATTTATAGATGTAAATTATTATCTGAAAAATTAAAATTGATTAGATTAAGAGACC +ATTACCAAAGAAATATGGGATCTGATAAAAGTTTTAAAAGAGCTTATTTTGTTAGATATG +CTGATGATATTATCATTGGTGTAATGGGTTCTCATAATGATTGTAAAAATATTTTAAACG +ATATTAATAACTTCTTAAAAGAAAATTTAGGTATGTCAATTAATATAGATAAATCCGTTA +TTAAACATTCTAAAGAAGGAGTTAGTTTTTTAGGGTATGATGTAAAAGTTACACCTTGAG +AAAAAAGACCTTATAGAATGATTAAAAAAGGTGATAATTTTATTAGGGTTAGACATCATA +CTAGTTTAGTTGTTAATGCCCCTATTAGAAGTATTGTAATAAAATTAAATAAACATGGCT +ATTGTTCTCATGGTATTTTAGGAAAACCCAGAGGGGTTGGAAGATTAATTCATGAAGAAA +TGAAAACCATTTTAATGCATTACTTAGCTGTTGGTAGAGGTATTATAAACTATTATAGAT +TAGCTACCAATTTTACCACATTAAGAGGTAGAATTACATACATTTTATTTTATTCATGTT +GTTTAACATTAGCAAGAAAATTTAAATTAAATACTGTTAAGAAAGTTATTTTAAAATTCG +GTAAAGTATTAGTTGATCCTCATTCAAAAGTTAGTTTTAGTATTGATGATTTTAAAATTA +GACATAAAATAAATATAACTGATTCTAATTATACACCTGATGAAATTTTAGATAGATATA +AATATATGTTACCTAGATCTTTATCATTATTTAGTGGTATTTGTCAAATTTGTGGTTCTA +AACATGATTTAGAAGTACATCACGTAAGAACATTAAATAATGCTGCCAATAAAATTAAAG +ATGATTATTTATTAGGTAGAATGATTAAGATAAATAGAAAACAAATTACTATCTGTAAAA +CATGTCATTTTAAAGTTCATCAAGGTAAATATAATGGTCCAGGTTTATAATAATTATTAT +ACTATTAAATATGCGTTAAATGGAGAGCCGTATGATATGAAAGTATCACGTACGGTTCGG +AGAGGGCTCTTTTATATGAATGTTATTACATTCAGATAGGTTTGCTACTCTACTCTTAGT +AATGCCTGCTTTAATTGGAGGTTTTGGTAACCAAAAAAGATATGAAAGTAATAATAATAA 
+TAATCAAGTAATAGAAAATAAAGAATATAATTTAAAATTAAATTATGATAAGTTGGGACC +TTATTTAGCTGGATTAATTGAAGGTGATGGAACTATTCTAGTTCAAAATTCATCTTCAAT +AAAAAAATCTAAATATAGACCGTTAATTGTTGTAGTATTTAAATTAGAAGATTTAGAATT +AGCTAATTATTTATGTAATTTAACTAAATGTGGAAAAGTGTATAAAAAAATTAATCGTAA +TTATGTATTATGACTTATTCATGATTTAAAAGGTGTATATACATTATTAAATATTATTAA +TGGATATATGAGAACACCTAAATATGAAGCATTTGTTAGAGGTGCTGAATTTATAAATAA +TTATATTAATTCAACAACAATTCTACATAATAAATTAAAAAATATAGATAATATTAAAAT +TAAACCATTAGATACATCAGATATTGGTTCAAACGCTTGATTAGCTGGTATGACAGATGC +AGATGGTAATTTTTCTATTAATTTAATAAATGGTAAAAATCGTTCTAGTAGAGCAATGCC +TTATTATTGTTTAGAATTAAGACAAAATTATCAAAAAAATTCTAATAATAATAATATTAA +TTTTTCTTATTTTTATATTATGTCTGCAATTGCACTATATTTTAATGTTAATTTATATAG +TAGAGAACGTAATTTAAATTTATTAGTATCTCTTAATAATACGTATAAACTATATTATAG +TTATAAAGTAATAGTGGCTAATCTATATAAAAATATTAAAGTAATAGAATACTTTAATAA +ATATTCTTTATTATCATCTAAACACTTAGATTTTTTAGATTGATCTAAATTAGTTATTTT +AATTAATAATGAGGGTCAAAGTATAAAACTTAATGGTAGTTGAGAATTAGGTATAAATTT +ACGTAAAGATTATAATAAAACTAGAACTACGTTTACTTGATCTCATTTAAAAAATACATA +TTTAGAAAATAAATAAATAAATTATTATTACTTTCTTCCCCTCCGAATCCGTAATATATT +TACGGATATATAATCTCGTAGTGTAAAAGGTGTAACGAGATTATTAATAAGTTGCCGTAA +TATATTGTAAAATATATTATTATTACAACACTATATGCGGGAAAACCCTAAAGTCATAAT +ATAATATTATCCCCACGAGGGCCACACATGTGTGGCCCTCGCGGGGTATGGTAAATTTAA +TTAAGTTATAAATGTACTATAGTATTAAAAATTATTATGAATAATTTCCCCACCCCCATG +CGAAGCATGGGGGGGGGTATAAGTATGGACAATCCGCAGGAAACCAAATAATAATTAATA +TCCTGAAACAAAGTAAGTGAAGGAGATATCTTAAAATATATATAATATATATTTTATAAA +TTATTATGTAGGATCCTCAGAGACTACACGTGTTGCACCCATTATATTATGTATAATGGG +TTGAAGATATAGTCCAAATATAATTGAAAGATTATAATAAAATGAACTATTTATTACCAT +TAATAATTGGAGCTACAGATACAGCATTTCCAAGAATTAATAACATTGCTTTTTGAGTAT +TACCTATGGGGTTAGTATGTTTAGTTACATCAACTTTAGTAGAATCAGGTGCTGGTACAG +GGTGAACTGTCTATCCACCATTATCATCTATTCAGGCACATTCAGGACCTAGTGTAGATT +TAGCAATTTTTGCATTACATTTAACATCAATTTCATCATTATTAGGTGCTATTAATTTCA +TTGTAACAACATTAAATATGAGAACAAATGGTATGACAATGCATAAATTACCATTATTTG +TATGATCAATTTTCATTACAGCGTTCTTATTATTATTATCATTACCTGTATTATCTGCTG +GTATTACAATGTTATTATTAGATAGAAACTTCAATACTTCATTCTTTGAAGTATCAGGAG 
+GTGGTGACCCAATCTTATACGAGCATTTATTTTGATTCTTTGGTCAAACAGTGGCCCTTA +TTATTATATTAATAATATATAATGATATGCATTTTTCTAAATGCTGGAAATTATTAAAAA +AATGAATTACAAATATTATAAGTCTATTATTTAAAGCCTTATTTGTAAAAATATTCATAT +CTTATAATAATCAGCAGGATAAGATAATAAATAATCTTATATTAAAAAAAGATAATATTA +AAAGATCCTCAGAGACTACAAGAAAAATATTAAATAATTCAATAAATAAAAAATTTAATC +AATGATTAGCTGGATTAATTGATGGTGATGGATATTTTGGTATTGTAAGTAAGAAATATG +TATCATTAGAAATTCTAGTAGCATTAGAAGATGAAATAGCTTTAAAAGAAATTCAAAATA +AATTTGGTGGTTCTATTAAATTAAGATCAGGTGTAAAAGCTATTAGATATAGATTACTTA +ATAAAACTGGTATAATTAAATTAATTAATGCAGTTAATGGTAATATTAGAAATACTAAAA +GATTAGTACAATTTAATAAAGTTTGTATTTTATTAGGTATTGATTTTATTTATCCAATTA +AATTAACTAAAGATAATAGTTGATTTGTTGGATTTTTTGATGCTGATGGTACAATTAATT +ATTCATTTAAAAATAATCATCCTCAATTAACAATTTCTGTAACTAATAAATATTTACAAG +ATGTACAAGAATATAAAAATATTTTAGGTGGTAATATTTATTTTGATAAATCACAAAATG +GTTATTATAAATGATCCATTCAATCAAAAGATATAGTATTAAATTTTATTAATGATTATA +TTAAAATAAATCCATCAAGAACACTAAAAATAAATAAATTATATTTAAGTAAAGAATTTT +ATAATTTAAAAGAATTAAAAGCTTATAATAAATCTTCTGATTCAATACAATATAAAGCAT +GATTAAATTTTGAAAATAAATGAAAAAATAAATAAATTATTTAATAAAGATATAGTCCAA +ATTATATATATATAATATATATATATATAACAAGCACCCTGAAGTATATATTTTAATTAT +TCCTGGATTTGGTATTATTTCACATGTAGTATCAACATATTCTAAAAAACCTGTATTTGG +TGAAATTTCAATGGTATATGCTATGGCTTCAATTGGATTATTAGGATTCTTAGTATGATC +ACATCATATGTATATTGTAGGATTAGATGCAGATCTTAGAGCATATTTCCTATCTGCACT +AATGATTATTGCAATTCCAACAGGAATTAAAATTTTCTCATGATTAATAAATCCCTTTAG +CAAGGATAAAAATAAAAATAAAAATAAAAAGTTGATCAGAAATTATCAAAAAATAAATAA +TAATAATATAATAAAAACATATTTAAATAATAATAATATAATTATAATAAATATATATAA +AGGTAATTTATATGATATTTATCCAAGATCAAATAGAAATTATATTCAACCAAATAATAT +TAATAAAGAATTAGTAGTATATGGTTATAATTTAGAATCTTGTGTTGGTATACCTCTATA +TACTAATATTGTAAAACATATAGTAGGTATTCCTAATAATATTTTATATATTATAACAGG +TATTTTATTAACAGATGGTTGAATTGATTATCTATCTAAAAAAGATTTAGATAAAAAAAC +AATTATAGAAATTAATTGTAGATTTAGATTAAAACAATCAATAATTCATAGTGAATATTT +AATATATGTATTTATATTATTATCACATTATTGTATAAGTTATCCTAAAATAAAAATTGC +TAAAGTTAAAGGTAAATCATATAATCAATTAGAATTTTATACTAGATCATTACCATGTTT +TACTATTTTAAGATATATATTTTATAATGGTAGAGTAAAAATTGTACCTAATAATTTATA 
+TGATTTATTAAATTATGAATCTTTAGCTCATATAATTATATGTGATGGTTCATTTGTAAA +AGGTGGAGGTTTATATTTAAATTTACAATCTTTTCTAACTAAAGAATTAATTTTTATTAT +AAATATTTTAAAAATTAAATTTAATTTAAATTGTCTATTACATAAATCTAGAAATAAATA +TCTTATTTATATAAGAGTAGAATCTGTTAAAAGATTATTTCCTATAATTTATAAATATAT +TTTACCTTCTATAAGATATAAATTTGATATTATATTATGACAAAAAAAATATAATATGAT +TAATTAATTAATTAATTAATTAATTTATTTATTATTTACTTTTTTGATATATATAGAGGC +AAACTCGAGGAAAACCATATAATTAGAATAAGTAATAATTATATGACAACCGTCGAACTA +AATCATATTCAAGAAATTAATATGTAAAAGCGTAGAGATTAGACGCCTCTGGTTATCTAA +GTAATATATATATATATATTATATGATAACATAAGGTATAATCCAATGAGATCAGTAATG +ATTTTAAAACAATAATTTTGTTTTAAGTATTAATAATAATATTAATATTCGACCTCTTAA +TTGAGGATATTATAATCATAATTTTTTATATTATAATATAAAATTTAACTAGCTAGATAA +TATTATATAAAAAAAAAAAATAATATTATATAAATTAATTAAAATAATTTTTATTAATTG +AAACTGAAATGTTTTAAAGTTAAATAAAAGAGCTCTAATCCATGGTGGTTCAATTAGATT +AGCACTACCTATGTTATATGCAATTGCATTCTTATTCTTATTCACAATGGGTGGTTTAAC +TGGTGTTGCCTTAGCTAACGCCTCATTAGATGTAGCATTCCACGATATTAATTTAATAAG +TGTCGTGCTTAAAATTCACTAAAATAATATATAATAAATTATAATAAATATATAAAAAAA +ATAAAAAAAATAAAAAAAAATTAATATCTTATGATTAATTTTATATAAATAAAAATTTAT +TAAATATTATTGGTTATATATATATATATATTAATAATAAAAAAATATATATATATATAT +AGCTAACGGGGAAACTCTTATAATTATTATTTATATAATAAATAAGACAATCCCGTGATA +ACTTTAATATATATATATTATATATTAAAGTATTGTAGAGACTAAACGTGAATGATTTTA +ATATTATTTAAATATTAAAATTAAGAGATAGTCCAATCTTATATGTAAATATAAGTTAAT +ACCAAAAAAAAAATAATATTATTTTGACTTATTATATATTAATATTATTAATAATAATTT +TAACTAATAATAAAGTTTTTATAGAAACTTTATATTATTATTTAATATTTAATTTTCAAT +TAATATCTCCTTTTGGGGTTCCGGTCCCTGGTCCGGCCCCCGAAACTAAAGATATTAAGA +ATTTATATGAATCAATTATAAATAATTATATTAATATTTTAAATAAATATCTTATTAATA +TTAATAAAGATAATATTAATAAATTAAAATTTTTAGATAATTATACTGAAGAAGAAAAAG +GTTATTATTTATCTGGATTATTTGAAGGAGATGGTAATATTTATACTAGATGTTTTTCAA +TTACTTTTTCTTTAGAAGATGTTTTATTAGCTAATTATTTATGTCTTTATTTTAAAATTG +GTCATATTACAGCTAAATATAATTTTAATAAAGAATTAACAGCTGTTAAATGAAATATTA +TAAAAAAAAAAGAACAAGAAGTATTTATAAATTATATTAATGGTAAATTATTAACATATA +AAAGATATGATCAATATTTTAAATATAATTTTAATAATCGTTTAAATATTAAATTATTAA +AACCTAAAGAATTTGATTTACTATTAAATCCTTGATTAACAGGTTTTAATGATGCTGATG 
+GTTATTTTTATCTAGGTTTTCAAAAACATAAAAATAGTCAATGATTAAAATTTCATTTAG +AATTATCACAAAAAGATAGTTATATTTTAGTCCGGCCCGCCCCCGCGGGGCGGACCCCAA +AGGAGATATTATTAAAAAATATTTTAAACTTGGTGGTATTTTAAAAAGAGATTATAAATC +TGGTGCTACAGCTTATATTTATAAAGCTCAATCATCAAAAGCTATAAAACCTTTTATTGA +ATATTTTAATAATTATCAACCATTAAGTCTTAGAAGATATAAACAATATTTATTATTAAA +TATTGCTTACTTATTAAAATTAAATAAATTACATATATTACTTAATTCTTTATTAATATT +AAAAGAATTAATATTATTACAAAGTGTTAAAAATATATCTTTAGAAATAAAAAATGAATT +AAATAATAGAGTTAAAATTATTATTAATAAACTTCATTATAACAATATCGAATAATGATA +ATATTAAAGAGTAAAATTCTTAAAGTGTTAATTAAATAATATTCTTTTTTTTTTATGACT +TACTACGTGGTGGGACATTTTCGTGCGGTCTGAAAGTTATCATAAATAATATTTACCATA +TAATAATGGATAAATTATATTTTTATCAATATAAGTCTAATTACAAGTGTATTAAAATGG +TAACATAAATATGCTAAGCTGTAATGACAAAAGTATCCATATTCTTGACAGTTATATTAT +AAAAAAAGATGAAGGAACTTTGACTGATCTAATATGCTCAACGAAAGTGAATCAAATGTT +ATAAAATTACTTACACCACTAATTGAAAACCTGTCTGATATTCAATTATTATTTATTATT +ATATAATTATATAATAATAAATAAAATGGTTGATGTTATGTATTGGAAATGAGCATACGA +TAAATCATATAACCATTAGTAATATAATTTGAGAGCTAAGTTAGATATTTACGTATTTAT +GATAAAACAGAATAAACCCTATAAATTATTATTATTAATAATAAAAAATAATAATAATAC +CAATATATATATTATTTAATTTATTATTATTATATTAATAAAATTTAATATATATTATAA +ATAATTATTGGATTAAGAAATATAATATTTTATAGAAATTTTCTTTATATTTAGAGGGTA +AAAGATTGTATAAAAAGCTAATGCCATATTGTAATGATATGGATAAGAATTATTATTCTA +AAGATGAAAATCTGCTAACTTATACTATAGGTGATATGCCTATCTTTATTTATATATATA +TTATTATTATTAATAATAAAAAAAAAAATTAAAAAAAAGATAGGAGGTTTATATATAACT +GATAAATATTTATTATATTATTTTTTTTTATAATAAATATTAAAAGATATTGCGTGAGCC +GTATGCGATGAAAGTCGCACGTACGGTTCTTACCGGGGGAAAACTTGTAAAGGTCTACCT +ATCGGGATACTATGTATTATCAATGGGTGCTATTTTCTCTTTATTTGCAGGATACTATTA +TTGAAGTCCTCAAATTTTAGGTTTAAACTATAATGAAAAATTAGCTCAAATTCAATTCTG +ATTAATTTTCATTGGGGCTAATGTTATTTTCTTCCCAATGCATTTTTTAGGTATTAATGG +TATGCCTAGAAGAATTCCTGATTATCCTGATGCTTTCGCAGGATGAAATTATGTCGCTTC +TATTGGTTCATTCATTGCACTATTATCATTATTCTTATTTATCTATATTTTATATGATCA +ATTAGTTAATGGATTAAACAATAAAGTTAATAATAAATCAGTTATTTATAATAAAGCACC +TGATTTTGTAGAATCTAATCTTATCTTTAATTTAAATACAGTTAAATCTTCATCTATCGA +ATTCTTATTAACTTCTCCACCAGCTGTACACTCATTTAATACACCAGCTGTACAATCTTA 
+AGTTATAAAATTTAATTATTTACTTAATAATTAAAAAGTAAATATTATATCTAAACTTAA +TAATATAATAATAATATTCTTATAAAAATATATAAAAAAAAATATATAAAATTTATTAAA +ATATCTCCTTTCGGGAACTATAATATATTTATATAAATAAATACTAATATAATCCTATTA +TATATATATATATATAAAATAATATATATATATAATTAATATAAATAATATTTATAATAA +TTTTTTAATAATATATATAATTTAATATATTAATGAATATTATATAATTATTAAATATAT +TATAATATTATTATTATTTTATAATAAAAATATTTTTAATACTAATTATTATTTATTATT +TATAAATATATAAATAGTATGTTTAATATTATTAATACTAAAAAAAATATAATTATAATT +AGGATCTAACAATACATTTATCTGATTAATATTAATATTAATATTAATATTTATATTAAT +AAACGGATTAAATTAATTGTATCCAATTTAATTAAATTATAGATATATTATTTATAATAT +TAATATATTGTTTTATTAAAAAGGTAAAAATAGTTTTTATTTTATATATAAATATAGGAT +ATAAATAAATATATTATAGTGAACCCCGAAAGGAGAATATATTAAGAATATATTTATATT +TTACATATAATTATTTATAATATAAATATCTCCGCAAAGCCGGATTAATGTAATTATTTA +ATAATTTTATTTAATAATTTATTAAAATAAATATTTACATTTGATAATATTTATATTATG +TCAGTTATTTTATATTAATGTTTAATCTATTATAATATTTTTTTTTATAAATATATTATT +TATTTATATTAATTATATATATATATTATTTTTATAATATATATATATTTTTATTAAATA +TTTATTAAATATTTATTAAATTATTATAATGTTGTTATTAATCTTATTAAAAAATATATA +TAAAAATGCCACAATTAGTTCCATTTTATTTTATGAATCAATTAACATATGGTTTCTTAT +TAATGATTCTATTATTAATTTTATTCTCACAATTCTTTTTACCTATGATCTTAAGATTAT +ATGTATCTAGATTATTTATTTCTAAATTATAATATATATTATTAATTTATTTATTCATAT +AAATATTATTATTATATATAAATATTAATAATATTTATACTTATTTAATAATAATAAAAT +AAAAAATAATTATAATTTAATATATTTAATATATTTCCTTACGGACTATATATTTATATA +TATATATTAAATACAATTTAATTTAATTTAATTATGTTATTTATTAAATAAAGTTATATT +ATGATATAATAACAATATTATATATTATTATATAATTATAATATATTTTAATATAATTAT +CAAAAGAAATAATAAAAAAATATTAATAAGAATATAATTTAATAATTATTAAAAAAAAAT +TCTTATAGTCCGGCCCGCCCCCCCCGCGGGGCGGACCCCAAAGGAGGAGTAATAAAAATT +ATTAAATACAAATATTATATATATATAATTCATTATATATATATATATATAATAATTAAT +CTTATTTTTTTATATATTTATTTATATATCTATTTATATTTTATATATATTTATTTATAT +ATCTAAGGGGTTCGGTCCCTCCCCCCGTAAGTATAATATACGGGGGTGGGTCCCTCACTA +TTTATATTTTTATTTTATATATTTTATATATTTATAAATAAAGTATAATAAGATATAATT +ATGATTAATTATTTATAAGTTATAGTTTTATAAATTTATAATTATTATGTTTAATTTATT +AAATACATATATTACATCACCATTAGATCAATTTGAGATTAGACTATTATTTGGTTTACA +ATCATCATTTATTGATTTAAGTTGTTTAAATTTAACAACATTTTCATTATATACTATTAT 
+TGTATTATTAGTTATTACAAGTTTATATCTATTAACTAATAATAATAATAAAATTATTGG +TTCAAGATGATTAATTTCACAAGAAGCTATTTATGATACTATTATAAATATGCTTAAAGG +ACAAATTGGAGGTAAAAATTGAGGTTTATATTTCCCTATGATCTTTACATTATTTATGTT +TATTTTTATTGCTAATTTAATTAGTATGATTCCATACTCATTTGCATTATCAGCTCATTT +AGTATTTATTATCTCTTTAAGTATTGTTATTTGATTAGGTAATACTATTTTAGGTTTATA +TAAACATGGTTGAGTATTCTTCTCATTATTCGTACCTGCTGGTACACCATTACCATTAGT +ACCTTTATTAGTTATTATTGAAACTTTATCTTATTTCGCTAGAGCTATTTCATTAGGTTT +AAGATTAGGTTCTAATATCTTAGCTGGTCATTTATTAATGGTTATTTTAGCTGGTTTACT +ATTTAATTTTATGTTAATTAATTTATTTACTTTAGTATTCGGTTTTGTACCTTTAGCTAT +GATCTTAGCCATTATGATGTTAGAATTCGCTATTGGTATCATTCAGGGATATGTCTGGGC +TATTTTAACAGCATCATATTTAAAAGATGCAGTATACTTACATTAAATTATAAAATAAAA +TTATAAAATAAAATAATTTACATATGGAGTATTAAACTATAATAAATACAATATACCCCA +TCCCCCCCTTTTAATAATATTCTTTTATCTAATAAAATATTTATTTATTAATATTATTAT +TATCTTCTTCAAGGACTTATTTAATATATTTAATAACTTATTATACTTATTTATATTTAT +AATTAATACAAATATATTATTAATCTTACTCCTTCGGAGTTCGGCCCCCCATAAGGGGGG +GACCTCACTCCTTCCCCACTGCACTGGATGCGGGGACTTATTTTTATTATTATTATTTAA +TCTTTATTTATAAAATTATATATTATATATAAATTATTATACTTAATAATTAAAAAAAAA +CCTCTAATTATTATTAATATTATATATAATATATATATTCTCATTAATGTTATATATAAT +ATATATATTCTCATTAATATATTAATATAGTATTAAAAAAAATAAAATATTTAATAAATA +TTATTATTAATAATATTTATTAAAAATAATATAACATAATAAATATAAGATTATTATATA +ATATATTTATTATATCATATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGG +AGAAATTATAACATATTTTTTAATAATATTCATATTTATTTTATATACAAATAAATATAT +TTATTTAGAATAATAAAAAAAAATAATAAATAAATATATTATTATCATTATTATACTTTA +TTCATTATTTATTATAATAATTATATATAACAATTATAATATATAATTATATTTTATATA +ATATTATATTAATATTTAATATATTTATTATTATTATTACTTCTATGGAAACTTTATATT +TTAGATATTTTTATTATTATTATTAATTTATAATGTTATATTTTTGATTTATAAATATAT +AAGTCCCGGTTTCTTACGAAACCGGGACCTCGGAGACGTAATAGGGGGAGGGGGTGGGTG +ATAATAACCAGAATATTCAATAAATACAGAGCACACATTAGATAAATTTTATAATATAAC +CAATATAAAATAAAATTAAAATAATTAATATATATATATAAATATAATAAATTATTATAT +ATAAATATATATAATTTTTATAATAAATATTATAATATTATATAAATAAATAATTATAAT +ATATAATAAATATATAATAATAATAAAAATATTAACAATATAATAAAAATTTATAATATA +AATATAAATTATAAATAAGTTAAATTAATAAAATAATAAATGATTAACAAGAAGATATCT 
+GGGGTCCCATTAATAATTATTATTTTCAATAATAATTGGGACCCCCCACCATTATAATAT +CATATTAATTAATATAATAATAATGTATATAAAATAGAAATAATAATTAATATAATAATA +ATAATATATATAAAATAGAAATAATAATTAAATATATATATAAATAATTATTTATATAAT +ATATTATAAATAATAATAATAATAAATATTTATTAATTAATAATGATTATAAATATTTTA +TTTAATATAAATTTATAACTATTTTATTATATATATATTTTTTATTCATAAAAATTCCTT +TTGAGGATTTTTATTTTATATAAATATCTTCTAATATTTATAATAAATAATAATATATTC +ATTATATTTATAATTATATATAATGTAATACGGGTAAACATTACCCGTTGTTCACGGGTA +ATGTTTACCCTATTTTATATAATTCTTAATAAATATATTTATATTTTTATATAAAAAAAA +TTATAATAATTTATTAATTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGAACTCCGGAA +CTATAAAAATAATTTTAATATAATTTATATATTTTATGATTAATATAATATATTATTAAT +GTAACTCCTTCGGGATTTGGTCCCCCTCGTAAGTATATAGTATATAGTATATAGTATACG +GGGGGTCCCTCACTCCTTCGGGGTTCGGTCCTCCCTTACGGGTACGGATACGGATACGAA +TATGGGGAGTCCCTCACTCCTTATCACTACGCTGAAGGTGGAATTTATTTTATATTATTA +TTAAATCTTTATTTATTTAATTATATATTTAATATATATATTATTATAATAAAACACCTA +ATTATTATTAATGTTATATTTAATATAATATATATATTCTTAAAAATTTATATAATATAA +ATAAATAAAAAAAAAAGAAAGTACATAATTAATATTATTATAAATAATATTATTAAAAAG +AATATAATATAATTAATAGAAAGACGTTTTAAAAATAAAAATAAAAATAAAAATAAAAAT +AAAAATAAAAATAAAAATAAAAATAAAAGAGTTTTGGTTTACATATCAAGACCCAATTCA +ATTGAAACTATTTATTTATTAATCTCCTCCCCTCCCCCTCACTATTATTATAAGTACAAT +TAGGGCGCCAACCCCGCAGTGTTATTTACTGGGAAATGTTTATCCCAATTAATATAATAA +CGAGAGTTATTAATTATTATTTATAAATTCATATAATGTAATATAATGTAATGTAATTAA +TAGAACATTATTGTGTTATTCACCAGTGTTAAGATATATTAATCCCAATTTTATTTAATA +GTGAAGATTATATTTTATTAATTATGAATCCATATTATTATTATTTAATATATTTATAAT +ATTATATATAATTATAATTATAAATAATTTATATAAAAAAAGTTTTATTAAAAAATATTA +TTAAAAATATAATATTAATAATAAATAAAAATAATATTATACTCTTAATAGAATTTATAA +TGATAAAAATTAAGATGAAGACTTTTTTTTATAATTATTATAAATTTATATAAAAATAAT +ATATATATATTTATATTTATTTTATTAATATATATAATATATTTATGTATATTAAAAAGA +TATATTTAAATATTTTTATTTTTTTTTTATAAGATAATTTTTGTAAATATATAAGTAATA +AATTAAGTTTTATAGGGGGAGGGGGTGGGTGATTAGAAACTTAACTGAATAATATATATA +AAGCATACATTAGTTAATATTTAATAATATAATCAATATATAATAATTATAAAATAATTA +ATTATATAATAATAATAATGTATAAACAATATAATAAATTGTATAAAATAAAATATAAAT +CATAAATAAAGCTAAATTAATAAAATAATAAATGATAAACAAGAAGATATCCGGGTCCCA 
+ATAATAATTATTATTGAAAATAATAATTGGGACCCCATATAGAATATAAATAATTAAATA +TATATATATAAATAATAATTTATATAATATATTATAAATAAATAATAATAAATATTATTA +ATCTATAATAATTATAAATATTTTATTAATATAAATTTAATAATTATATATATTTTTATA +ATAACTCCGAAAGAGTAAGGAGATATTAATTTCTTATAAAAATTTATTAATAATAATAAT +ATATAAAATATATAAATAATATATTATATATAAAATAAAATAAAATAAATAATATATTAA +AAATATTGAAAGTATTTTAATAAATAATAAATTTAAAATTCATATTTATAATAATAAATA +AATAAATAAATAAATAAGTAAATATTTAGATTCTCATTAATATTAATATTTATATTTCTT +TTTTTTTATAATAATAAAAATATCATATATAAATATAATATAATATAATATAATAAATTA +TTATATATAAATAATAAATATTAAATATAATATATAATAATATATAATCTTACAATTTAT +AATTTAATAAAGAAGGAAATAAATAATAATAACTCCTTTTGGGGTTCCGGTGGGGTTCAC +ACCTTTATAAATAATAAATAAAGATGTTTACTCCTCTTCGGGGTTCCGGTCCCCTTTTTG +GGTTCCGGAACTAATTAATATTTTATATAATAATAATAATATATTAATATAATTTCATTA +TTAATAAATATCTCCTGCGGGGTTCGGTTCCCCCCCGTAAGGGGGGGGTCCCTCACTCCT +TCGGAGCGTACTATTATTATAAATAATTATATATTATAATATAATTAAAAAGTATTATAA +TTGAAACGAAAATTGTAATTTTAAATGGAATAATAATTATTATATATTTAATATATTTAA +TAAAGTTATAATATCTCTTTCTACCGGACTATTTTATTTTATTTTATTTTATTTTTATAA +AGAAAAATAGTAATAATATTATCTTCTCCTCCTTTCGGGGTTCCGGTTCCCGTGCCGGGC +CCCGGAACTATTAATTATATAATATAATATAATATAATATAATATAATATGATACGGATC +AAACATTACCCGTTGTTCACTGGCAATGTTTAATCCTATTGTATATAAATATAATAAAAT +AATTATCCCTCTCGTAATACATATATAAAATATAAAATATAAAATAAAAATATTATGATT +ATTATAATATATATATATATATATATAAATATATATATATAATTTATAATTTATATGATT +AATATATTATATATATAAAAAATATATTAAATTTACTTTTTATAGAAAGGAGTGAGGGAC +CCCCCCCCCTTACGGGGGGGAACCGAACCCCGCAGGAGATATTTATTTTAATACTTATAT +AGTATTTATTAATAATATAATAATTGTTATTATAAATATTAATAATAATATAAAAATAGG +GTAAATAATATAAATAATATGAATAAATATAAAAACATATTAAATATAAAATATATCATA +AATTTAATAAATATTATAATAATTTATAAATGATAGATATCTGGGGTCCTATAAATAATA +ATTATTTTCAATAATTATAGGGACCCCCACCTATTATATAAATATAAATATAAATATAAA +TATAAATACAAATATAAATATATAAATATATAAATATAATATAAATACAAATATAATATA +TAAATATAAATATAAATATATAAATATAAGTCCCCGCCCCGGCGGGGACCCCGAAGGAGT +GAGGGACCCCTCCCTATACTAATGGGAGGGGGACCGAACCCCGAAGGAGTATAAATAAAA +ATTAATAATATATATATAATTATAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCC +GAAAGGAGAAATAATAATATAATATATAATAAAATATAACTTATTAATATAATATTAAAA 
+ATATAATTAACAAGAATAAATAGTCCGTGGGATCGAACCCCCTTTTTTATTTAATATTTA +ATATTTAAAGAAGGAATTGTTTATATATATTAATATCTTATTTGGGGATTAATATAATAT +ATAAGTTTTGGATACCAGGCCAAAGACCGGAATCCCAAAAGGAGATTATATAAATATTAT +TTATCTCCCTTTTTTAATATTATAATAATTTTATTAAAAATAAAATAATAATAATAATTA +TAATTTATAATAACAATTATAATAATTTAATTAATTAATTAATTAATTAATTAATTAATT +AATTAATTAATAATAAATATAAATATAAAAAGAATATAATTTATAATAAATAAATTTATA +TATATATATATATATTAAATAAAATATTTACTTCATTAATATAAAATATAAATATATTTA +ATTAATAAGTATATATATATAATAATATATAATAACCTATTTATATATATAATCTTAATA +TAATTATAAGAAATATTATATAAGTAATATATAAAAATAATATAAAATAATTATAATTCA +ATTTATATATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGGAAT +AAGATAAATATATAAATTATATTAATAAATATAAATTTTAAATGAATTAATAAAATTAAT +ATATATATGTATATATATATATATATTAAAAATATTTAATTATTTTTAGGAAGGAGTGAT +AGATCCCTTTGGGGGACCGAACCCCTATTTAAGAAGGAGTGCGGGACCCCGTGGGAACCG +AACCCCTTTTTTATTTAAAGAAGAAGTTTTATTTTATTTTATTTTATTTTATTTTATTTT +ATTTTATTTTATTTTATTTTATTTTATTTAATTTAATTTTAATTAGGTTAATAAATAGTA +ATAATAAACTTAATAATAATAATAATAATTTTATTTTTATAATTTATTAATAATAATAAT +AATTATATATATATATATTATTAATAAATATAGACCTTATCGTCTAATGGTTACGACATC +ACCTCTTCATGTTGATAATATCGGTTCGATTCCGATTAAGGTTATTCATAATAATAAATA +TTTGTAAAAAAAGTATATATAATTAAACATATTCTTTATATTAATTAATAATTATTAATA +ATATACATTTTATATAATACAATTATATATATATATATATTTTTTTTTAATACAAATAAT +ATATTCATAATAATAAATACCGATTGTTATTATACTATAATAAAATATATAATATATTTT +TCATTATAATATTTTTAAATAAATATTATAATAAATTATATAAATAATATTTATGTATAA +TAATAATAATAATAATTGTTATTAATTAATTCTATAATTATTATATATTTAATTTTTTTT +TTTAATATAATATATAATAATATAATTTATTTTATTTTTTTTTATAGTTCCGGGGCCCGG +TCACGGGAGCCGGAACCCCGAAAGGAGAATATAAATTAATAATAATATAAATAACATATT +AACAATAAATTATTGTTAATATAATAATAATAATAACAATATTAATAAATAATATAAAAA +TTATTAATATTATATTTATATAATATTAATATAAAAATCTTTCATAATATTAATTATTAT +TAAATAATAATGATATCATTAATATTAATATAATCGTCAATATTATTTATTTATTTATTT +ATTTATTTATTTATTTATTTATTTATTATTAAATAAATATTTTTAAATATTATATTATAT +TATTAACTTTTTATTAAAAAAATTAATAATGATATAATATAATTAATATTATCCACGGGA +CCAATGACCAACCCAGTAGTTGACCGGATTGGCGCCCGCGAGGTTTATATTTAATAAATA +ATAATAATAATATTAATAAAATCTATTAACTTTTTTTTTTAATGGATTATATTAATGAAA 
+AAAAAATGAGAAATATCTTTTTTTTTTAATAATTATAATTTATATATAATAAAATATGTA +TATATAATAAAAAAATAGTTTTTAATATTATAATATAATTATATATATAATTATAAATAT +ATATATATATAATAAGTATTAATTAATAATATATATTTATATATTTTTTATTAATTAATA +TATATAAAATATTAGTAATAAATAATATTATTAATATTTTATAAATAAATAATAATAATA +TGGCATTTAGAAAATCAAATGTGTATTTAAGTTTAGTGAATAGTTATATTATTGATTCAC +CACAACCATCATCAATTAATTATTGATGAAATATGGGTTCATTATTAGGTTTATGTTTAG +TTATTCAAATTGTAACAGGTATTTTTATGGCTATGCATTATTCATCTAATATTGAATTAG +CTTTTTCATCTGTTGAACATATTATAAGAGATGTGCATAATGGTTATATTTTAAGATATT +TACATGCAAATGGTGCATCATTCTTTTTTATGGTAATGTTTATGCATATGGCTAAAGGTT +TATATTATGGTTCATATAGATCACCAAGAGTACTATTATGAAATGTAGGTGTTATTATTT +TCATTTTAACTATTGCTACAGCTTTTTTAGGTTATTGTTGTGTTTATGGACAGAGTGAGA +CAAGTATAAGTATATTATTATAATATCATACCATTAAATAAATTATTTTAATGAAATGAT +TATGTTTATATATAACATATACCTAATTAGACATGCATTATTAGTAATAATTTTGTATGA +AACTCTAATAATAATAATTATTATTAATTATTAAGGTAAGATTCATATGGATAGCGTAAG +TCAATCTAATATTATAAAATATCGTAACATAAACAATATTTTTTTCTATTATTAATTAAT +AAATAATAATAAATAAAAATAATTATATGAGAAGTAAGATATTCAATTCTGTCTAGAATA +CATATATATACGTTAATACTCATCGGTATAAAATTAGAATCCTAAGTGAATTATTGAAAG +TATAATAATATAAACTTGGTAAGCCCAATTATTTCCATATAATATTAATATAAATATTAT +ATGGTAGTTATATATAATATTATTAAATAAATAATAATAGAAATTATAATATAGATAAGT +GGGTAAAAGACTATTGAAAAAGCTAAAGATTATATGTAATGTATAATATAGATCAAATTA +TTTATATATTTTAATAAAAATATATTAATAATGGTTAATATTATTATTAATTAATTAATT +AATTAATTAATAATAATAACGAATAAATGATTAATGTGAAAGCATGCTAACTTCAATATA +GGATGATTTATATAGTATATAAATTGTTTGAGCTGTATACTATGAAAGTAGTACGTACAG +TTCTGAGTGGGGGAAAATTTGTAAAGATCTACCTATCACAATTGTCACATTGAGGTAATA +TAAATATCGCCTCAAATATATTTAATATAATAAAACTAATTTATATAATAATGTTAATAT +TATTAATTTATATTTTTTATACGATTATAATAAGACAAATAATAAAAACTAAAGAATATC +TTATATTAATTAAGAGTATAGATTATATTAATAAAAATAAATATATAATTAATTTAAATA +TAACAAATAAGAAAGATATAAATAATAATATTGGTCCATTAAATATAAACATTTTATCAA +TTATTTATGGTTCAATATTAGGAGATGGTCATGCTGAAAAAAGAAAAGGTGGTAAAGGAA +CAAGAATTGTATTTCAACAAGAATATTGTAATATTAATTATTTATATTATTTACATAGTT +TATTAGCTAATTTAGGTTATTGTAATACTAATTTACCTTTAATTAAAACTAGATTAGGTA +AAAAAGGTAAAATTAGACAATATTTAAAATTTAATACATGAACTTATGATTCATTTAATA 
+TGATTTATTCAGAATGGTATATTAAAAATATATCTGGAAAAGGTAATATTAAAGTTATTC +CTAAATCTTTAGACAATTATTTAACTCCTTTAGCTTTAGCTATTTGAATTATAGATGATG +GATGTAAATTAGGTAAAGGTTTAAAATTCACAACTAATTGTTTTAGTTATAAAGATGTTC +AATATTTACTTTATTTATTACATAATAAATATAATATTAAATCTACTATTCTTAAAGGCA +ATAAAGAAAATACACAATTTGTTATTTATGTATGAAAAGAATCTATACCTATTTTAACTA +AAATTGTATCTCCTTATATTATTCCTAGTATAAAATATAAATTAGGTAATTATTTATAAT +AAAATATATAGTATTATATTAATTATTATATTATTATAATGCGATATTATTGAAAACATG +TCAAAATTATATTATTAAGTAACAAGACAGTGGGTTATATAATTATATGATCCCAACAGA +ATACACCAATAATAGGTATTATTATAAAAAAAATAATAATATTTAATGTTTATTCGAAGA +AAATTTATAATATTATTATTATAACACAAGGTTTAATAATCTATATATATATATTATATA +TATAACTACTGTTATTATTCCATTTACCTAATTAATATATAAATAATGAATTATAATTAT +TATGATTAATATTTTTATAATAATAACCCCATCATAACATTTATATATAACATTTATATA +TAACATTTATATATAATATTTATATTATGGTATTATTAGGTATAAATATTTATTCATAAG +AGAAAATAGTGATTAAATGGAATTATAAAAAGGGTAGATATTATTAAATACAGGGTATTA +TTTATATTAATAAATCAATAAATATTGAGATTATTATTATTAAAAAATAATAATAATTTA +TAAATAATATTATTTTCTTGGCACTAGTTATTACTAATTTATTCTCAGCAATTCCATTTG +TAGGTAACGATATTGTATCTTGATTATGAGGTGGGTTTAATATAGAGGATCCATATTATA +GTAATATAATATTAAATAAATCTGTTTTATGCTGAAATATCTTCATTTGAATAATAAATT +ACTATATTATTCAATTAATTATTTATAATAATATAATTTGAAATAAAAATAATATAGTTA +AAATATTTATTATAAGAAGAAAATTAGCAGTAATTAATATATATATATATATAAAATTAA +TTATTCAGAGACTTTATAGTTATTATATAAATAATACTATTATTTATGATAAAAATCATA +AATTAAACACAGATAATCCTATTTATGCATATATTGGTGGTTTATTTGAAGGAGATGGTT +GAATTACTATTTCAAAAAAAGGTAAATATTTATTATATGAATTAGGTATTGAAATACATA +TTAGAGATATTCAATTATTATATAAAATTAAAAATATTTTAGGTATTGGTAAAGTAACAA +TTAAAAAATTAAAAATAAAAGATGGTACTATTAAAGAAATATGTAAATTTAATGTAAGAA +ATAAAAATCATTTAAAGAATATTATTATTCCTATTTTTGATAAATATCCTATATTAACTA +ATAAACATTATGATTATTTATATTTTAAAGATAATTTATTAAAAGATATTAAATATTATA +ATGATTTATCTTATTATTTACGTCCTATTAAACCATTTAATACTCTTGAAGATATTTTAA +ATAAAAATTATTTTTCTTCATGATTAATTGGTTTTTTTGAAGCTGAAAGTTGTTTTAGTA +TTTATAAACCTATAAATAAAAAAATAAAACTTGCTAGTTTTGAAGTATCTCAAAATAATA +GTATAGAAGTTATATTAGCTATTAAATCATATTTAAAAATTACTCAAAATATTTATACAG +ATAAATTTAATAATTCAAGAATAACACTTAAAAGTATTAATGGTATTAAAAATGTTGTAA 
+TATTTATTAATAATAACCCTATTAAATTATTAGGTTATAAAAAATTACAATATTTATTAT +TCTTAAAAGATTTACGTCTTATTCTTAAATATAATAATTATTTTAAAATTCCTCCTAAAT +ATTAATCTTATATAAAAATATAATAATAATATATTTATATATTATATAATTATATAAACA +AAATATAATTTATATATAATTATTTATTATAAATATAGTCCGGCCCGCCCCGCGGGGCGG +ACCCCGGAGGAGTGAGGGACCCCTCCCTATTCTAACGGGAGGGGGACCGAACCCCGAAGG +AGTTTAATTATATATTAAATATATTATTATCAATAAATAATTCCTTTGAACTATTTATTA +TTTTATTATATTTATTTTCTCCTTCATTATTAATTTTTATTAATAATTAAAATCTTATCA +TTTTATGGTATTTTTATTTCTATTTTAGGATATCGAAACTATAAATTAAAAAGTATAATT +TTATTAATTATAATTTATGATTAATAAATAAGAAATAAAAACTTTAGAAGTAATATTTAT +CTTTTTTTTTTATAAATAAATATTATGATTAATATATAATCATTTATAAATATTTATATA +TAATTATATATATACATAAATAGGATTAAGATATAGTCCGAACAATATAGTGATATATTG +ATAATAGTTTTCAAATATGTAACTATTTAAACATTAAAAGCTCAGTATCTAACCCTCTAA +TCCAGAGATTCTTTGCGTTACATTATTTAGTACCTTTTATCATTGCTGCAATGGTTATTA +TGCATTTAATGGCATTACATATTCATGGTTCATCTAATCCATTAGGTATTACAGGTAATT +TAGATAGAATTCCAATGCATTCATACTTTATTTTTAAAGATTTAGTAACTGTTTTCTTAT +TTATGTTAATTTTAGCATTATTTGTATTCTATTCACCTAATACTTTAGGTCAAAATATGG +CCTTATTATTAATTACATATGTAATTAATATTTTATGTGCTGTATGCTGGAAATCTTTAT +TTATTAAATATCAATGAAAAATTTATAATAAAACTCTATATTATTTTATTATTCAAAATA +TTTTAAATACAAAACAATTAAATAATTTCGTATTAAAATTTAATTGAACAAAGCAATATA +ATAAAATAAATATTGTAAGTGATTTATTTAATCCCAATAGAGTAAAATATTATTATAAAG +AAGATAATCAGCAGGTAACCAATATAAATTCTTCTAATACTCACTTAACGAGTAATAAAA +AGAATTTATTAGTAGATACTTCAGAGACTACACGCACACTAAAAAATAAATTTAATTATT +TATTAAATATTTTTAATATAAAAAAAATAAATCAAATTATTCTTAAAAGACATTATAGTA +TTTATAAAGATAGTAATATTAGATTTAACCAATGATTGGCCGGTTTAATTGACGGAGATG +GTTATTTTTGTATTACTAAAAATAAATATGCATCTTGTGAAATTCTTGTAGAATTAAAAG +ATGAAAAAATGTTAAGACAAATCCAAGATAAATTTGGTGGTTCTGTAAAATTAAGATCAG +GTGTTAAGGCTATTAGATATAGATTACAAAATAAAGAAGGTATAATTAAATTAATTAATG +CCGTTAATGGTAATATTCGTAATAGTAAAAGATTAGTACAATTTAATAAAGTATGTATTT +TATTAAATATCGATTTTAAAGAACCTATTAAATTAACTAAAGATAATGCTTGATTTATAG +GGTTCTTTGATGCTGATGGTACTATTAATTATTATTATTCCGGTAAATTAAAAATTAGAC +CTCAATTAACTATTAGCGTTACAAATAAATATTTACATGATGTTGAATACTATAGAGAAG +TATTTGGTGGTAATATTTATTTTGATAAAGCTAAAAATGGTTATTTTAAATGATCTATTA 
+ATAATAAAGAATTACATAATATTTTTTATCTTTATAATAAAAGTTGTCCTTCTAAATCTA +ATAAAGGTAAACGTTTATTTTTAATTGATAAATTTTATTATTTATATGATTTATTAGCTT +TTAAAGCACCTCATAATACTGCTTTATATAAAGCTTGATTAAAATTTAATGAAAAATGAA +ATAATAATTAAATTTTCTCCGTATTCATTATTATATTATCTAATTTATAAAATATTTAAA +GATTCCTTATAATAATATAACATCTTTGTAAATTATTGTTAAAGATAATATAAATTATTA +TGAATCGGTAGATTATATTTTTACAATCTTATTAAATAAAATTCTGATCATTAAACATGA +TTGAAGAAATAATAATAGTTTATGAAATAAGATAGTGTAATATAAATTTTTATGAAGATA +TAGTCCATTTTATATTTATTATAAAAGCATCCTGATAACTATATTCCTGGTAATCCTTTA +GTAACACCAGCATCTATTGATATTAAAAATATTAATAAAATTATTATTATTTAATCTTAT +TTATTTTATATAAAAAAAATAAATAATAATTATTAATAAAAATATATTATTTATTTCTCC +TTTCGGGGTTATTTATATATATTCCTTTATAATTTATATTTAATATATTATATTAAATAT +ATGAAAAATTATAATAAATAAATTAATTAATTAATAATAAATAATAATAAAAAGTACAGT +AGCATTAAATATTCTTAAGTTTCCGCTTTGTGGGAACTCCCATAAGGAGTTTAATGATTA +AAATTGGTTAATTGTCAAGAAAATCTAAGGTATTAATAAATAAATAATACTATGACAACT +TGCAGCGAAGTTTATATCATCTCTATATTATATATTAATATATATATATAATAATAATAA +TAATATTAATATAATATAAGATATAAAAACGTTCAACGACTAGAAAGTGAACTGAGATAG +TAATACCTTTCCACGAAAACCAATTAATTTATAAATTATTTTTAAATAAAGAATAGATTA +TTAATTTTTTTTATATAGTTCCGGGCCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGTA +ATATATATTATATATAAAATAAAAAATATATATATATATATTATAAAATATCAAAAGTTT +TAATCTTTTATTATAAATTAATGACATAGTCTGAACAATAATGAAAATTATTGAGATAAG +ATATTAAATAATCTTATGTTAACATATATAAATTGTGTACCTGAATGATACTTATTACCA +TTCTATGCTATTTTAAGATCTATTCCTGATAAATTATTAGGAGTTATTCTAATGTTTGCA +GCTATTTTAGTATTATTAGTTTTACCATTTACTGATAGAAGTGTAGTAAGAGGTAATACT +TTTAAAGTATTATCTAAATTCTTCTTCTTTATCTTTGTATTCAATTTCGTATTATTAGGA +CAAATTGGAGCATGCCATGTAGAAGTACCTTATGTCTTAATGGGACAAATCGCTACATTT +ATCTACTTCGCTTATTTCTTAATTATTGTACCTGTTATCTCTACTATTGAAAATGTTTTA +TTCTATATCGGTAGAGTTAATAAATAATATATAATTAAATTAATACATAGATATAATATA +TATATTATTATTATTAATAATATAATAAAAATAAAAATAAAATTATTAATAATAATAATA +CTTTAATAATATTCTTAAAAATAATATATCTCTAATTTATAAAAATTAAATAATAATAAT +AAAAAAAAAATATTATAAAATATAAATTAATTAATAATGAAAATAATATACTTATTAAAT +TAATATAAATAAATGAATAATATAATATAACTATATTGAATTATAATCTATCTATCTTTT +TTTTTCATATAATTATAATATATATATTAATATATATAATTATTATTTTATATATTATAG 
+TTCCGGGGCCCGGTCACGGAAGCCGGAACCCCGCAAGGAGATTTATTAATTATTATTATC +ATTATTATTTTTTATTTAATCTTATTTATTATAAAATAATTAATTATCATAAAGCATAAT +TATTATAGAATCTTATTATTTTCTTTATTTAAATTTATAAAAATATAAAGTCCCCGCCCC +CTTTTTATTTTATTTAATTAAGAAGGTATTTTAAAAAAGGAGTGAGGGACCCCCTCCCGT +TAGGGAGGGGGACCGAACCCCGAAGGAGTACTCATTTAATATAAATATTAAATAAAAATT +ATTTTATATATATTAATGATTATTAATATTGATAATATAAATTATTTTATAATTAATTAT +TATAAATATATAACTATTAATAATTAATTTTTAATCTAGGGGTTTCCCCCACTTACATAA +ACTTACGTATACTTACATATACTTATGTATACTTACATATACTTACGTATACTTATATAT +ACTTATGTATACTTACGTATACTTACATATATGGGGGATCCCTCACTCCTCCGGCGTCCT +ACTCACCCTATTTATTAATCATTAATAAGAAATTATTATTAAAAAAATTATAATTTACTC +AAAGTTAATTATAAATATATTTTTAAATATCTATTTTATTAATCTTTTATAAAATTTAAA +TTAATTGTAATTAATTAATATTATAATAATTATTCTTAGGAAGGATATTTATTTATTTTA +ATTATGAATTCCTGACATAGAGACAATTAATTAGAACTTCTTATTATTATTATAGTAATA +ATAAAAATATTCTAAATATATTATATATATTATTATTTTTTTTATTATTAATAAAATATT +ATAATAAATTTAAATAAGTTTATAATTTTTGATAAGTATTGTTATATTTTTTATTTCCAA +ATATATAAGTCCCGGTTTCTTACGAAACCGGGACCTCGGAGACGTAATAGGGGGAGGGGG +TGGGTGATAAGAACCAAACTATTCAATAAATATAGAGCACACATTAGTTAATATTTAATA +ATATAACTAATATATAATAATTATAAAATAATTAATTATATAATATAATATAAAGTCCCC +GCCCCGGCGGGGACCCCAAAGGAGTATTAACAATATAATATATTGTATAAAATAAATTAT +AAATATTAAATAAAAACCAAATAAATAATATAATAAATGATAAACAAGAAGATATCCGGG +TCCCAATAATAATTATTATTGAAAATAATAATTGGGACCCCCATCTAAAATATATATATA +ACTAATAATATATTATATATATTAATATATAATAATATTATTAAAATATAATATTATTAA +AAAAAAAGTATATATAAAATAAGATATATATATATAAATATATATATTCTTAATAAATAT +TATATATAATAATAATAAATTATTTCATAATAAATTATTTCTTTTTATTAATAAAAATTA +CTTATCTCCTTCGACCGGACTATTAAATATTAAATATTTAATATTTAATATTTAATATTT +TATTCTATAGATATTCATATGAAAAATAATAAGTATATAATTATGATAATGAATATATTT +TTATTTATAATTTATTATTATAAAAATATTTTAATTTAATAATAATAATAAATCATTATA +TTAATTCTTTTAAGAATTTATAATTGTCATTATTTATTATATACTCCTTATTAAAAGGGA +TTCGGTTTCCCTCATCCTCATGGGTATCCCTCACTCCTTCTGATAATTAATTTTATAATA +ATAATAAAATAAACTTAATTAAATATTATATATTTATTTACAATTATATATATATATTAC +TCATAATTAAATTAAATTAAGATGCAATTCAATACGGTTGTATTATATTATTCATCAAAT +ATTGTTAATATTGATACCTACAGAGATATTTAATATTTTTATTATTATTATCCATTACTT 
+TTTTTATTATATTTTAATTATTTATTTATTTATTTATTTATAATAATAATATTTCATATT +ATCAATTATTATTTTTTTTTTTTATAATATATAATTAAATTATTTATATAGTTCCCCGAA +AGGAGAATAAATAAAATATTATATAAATATTTATATCTTTATTAATATTAATATAAGTAA +TATATATAGTTTATGATATTTAATTTTATCATAATATAATAATAATTATATAAATCTTAT +ACACATTTATATAAGTATATATATATATTATTAATATAATGAACATCTATTAAATAAAAT +AATTGTAAATCTCAAGTAAATTATTATTATTTTATTTTTAATAATAATTTATGATTTATA +ATTAATAAATAAAAGAGTAATTATATGATAAAAAAGGTAATAAATAAAATTTATAGTTCC +GGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTTTATTTATATATATATATATATG +AATTAATATTTAATAATAAATAATAATATAATTAATAATATTATTATTATTATAATTTTT +TTATTTATAATATTAATAAAATATTATTATATATATATTATAATAATATTAATAAGATAT +ATAAATAAGTCCCTTTTTTTTTATTTAAAATAAAGAAAGAATAATTAAATAATATTTTAA +TAATTTAATTAAATAGTGTATTAAAAGATAATAAAAAGTAATATTAATATGTTAATTATA +TATAATATATTTATATATAATTATATATATATATATAAATAATAATAAATATATATATAA +TATAAAAATAAGAATAGATTAAATATTTAATAAATAAATATTATGCAATTAGTATTAGCA +GCTAAATATATTGGAGCAGGTATCTCAACAATTGGTTTATTAGGAGCAGGTATTGGTATT +GCTATCGTATTCGCAGCTTTAATTAATGGTGTATCAAGAAACCCATCAATTAAAGACCTA +GTATTCCCTATGGCTATTTTAGGTTTCGCCTTATCAGAAGCTACAGGTTTATTCTGTTTA +ATGGTTTCATTCTTATTATTATTCGGTGTATAATATATATAATATATTATAAATAAATAA +AAAATAATGAAATTAATAAAAAAATAAAATAAAATAAAATCTCATTTGATTAAATTAATA +ACATTCTTATAATTATATAATTATTATAAAATATATAAATATTATAATAATAATAATATA +TATAAATTATAATAAAAAATAATAATAATATATAATATACCTTTTTTTTAATATATTAAT +ATATAAATAAATAAATAATGGATAATATATAATTACTTTTTTTATATTATTAATAATAAT +AATTTATAAATATTGTTATAATAAACATTTATATAAATAAATATAAATTACCATAATAAG +ATATATTATTTATTAATAATAAAAATATTTATTAATAAATAAGAAATATATATATTATGA +TAATATTTATTAATAAATAATAAATTCTTTATATATAAATATATTAAATATATTTAATTG +AACACAATATAATTTTTATTGTATTATTCATTTAATAATATTAATATTAATATTAATATA +ATATTAGTGAACATCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTA +ATATTTAATAAAATATATATAATTTATAATTTTCATATAATTAATATAATAATTAGGTTT +ATAAATAAATTATAATATATTATAACAATATAATAAAATATATTATAAATCTATCTATCT +ATCTATATAATATATAAATTTATATATACATTAATAATATTTAATTATAATTATTTAAAT +ATTTAATTTATTAATATTCCCCGCGGGCGCCAATCCGGTTGTTCACCGGATTGGTCCCGC +GGGGTTTATATTATTTAAATATTAAATATTAAATAATAATTTATATTATATTAATAAATA 
+TAATAAATTAAAAATATATGATTAATTATATAATAATAATAATAATTATTTTAATATTAT +AATTTATAAAATTAATTATATTAATTATATTAATTCTTATTATATAATAATTATTAATAA +TAATTTATTTTAAGAAAGGAGTGAGGGACCCCCTCCCGTTAGGGAGGGGGACCGAACCCC +GAAGGAGAAAATAAATTAATAAAAGTTTAAAAGTTCTTATATTAATAATTATATAATATT +ATATTAAAGATTTTTATAATATATATATATAATATATTTATAGTTCCGGGGCCCGGCCAC +GGGAGCCGGAACCCCGAAAGGAGTTTATTTAATATTTATATTTATATTAATATTTATATT +TATATTTATATTCCTCTTAAGGATGGTTGACTGAGTGGTTTAAAGTGTGATATTTGAGCT +ATCATTAGTCTTTATTGGCTACGTAGGTTCAAATCCTACATCATCCGTAATAATACATAT +ATATAATAATAATTTTAATATTATTCCTATAAAAATAAAATAAATAAATAAATAATAATA +ATTAATTAATTAATTAATTTTAATAAATATAAAATATATAAAAATAATAATAATAATAAT +TATTATTTTAATAATATTATTTATATAATAGTCCGGTCCGACCCTTTTTATTCTTAAGAA +GGGATTTTATTTTATTAATTAATAATAATATATTAAAAATTATAAATAATTAATAATTCT +TTATATTTATATATATATATATATATTTATATATTTATATATATATTTTAATAATATTAT +GATATATTTTATTTTAATAATATTTTTATTTTTATATATAAAATTATAATATTTTATTTT +ATAAATTATTTATATATAAATTATTAATAATAATTATTTTTTTTTATTTGGGATTTATAT +TATTATTATAAAGAATATAATGTTATTAATAACTGCAAAAAATATCTAATATATTATTAT +TTATAATAATAAATAATATTATAATAAGGATGCATATTATATATATATATATATTTCTAT +TTATATTAATATTAATATTAATATGTATATATAATAGATAAAAAGTAAAAATAAAAAATA +ATGAAATTAAAATTATTAAATATAATTTTATCAATAATAAATAAACTTAATAATAATAAT +AATATTATTATTAATAATCTATTAGATTCATTAATAAATAAGAAATTATTATTAAAGAAT +ATATTATTAGATATAAATAATAAAAAAATAAATAATATAAAAAGAATATTAAATAATAAT +AATATAAACCCCGCGGGCGCCAATCCGGTTGTTCACCGGATTGGTCCCGCGGGGAATATT +AATAATAAATTACAACATTTAAATAATATAAATAATTGAAATCTACAAATTTATAATTAT +AATAAAAATATAGAAATTATAAATACTATAAATGATAAATTAATTAATAAATTATTATAT +AAAATAATAACTTTAAAATTAAATAATATAAATATTAATAAAATTATTATAAGTAAACTT +ATTAATCAACATAGTTTAAATAAATTAAATATTAAATTTTATTATTATAATAATGATATT +AATAATAATAATAATAATAATAATAATAATTATTATATAAATATAATAAATAAATTAATA +AATATTATAAATAATAATATAAATAATAATTTATGTAATATTTTAAGTTATTATTATAAA +AAAAAAGTAACTATTGAACCTATTAAATTATCATATATTTATTTAAATAGTGATATTTTT +AGTAAATATATTAGTTTAAATGATATAGATAAATATAATAATGGTATCTTAACTAATTAT +CAACGTATATTAAATAATATTATGCCTAAATTAAATGATCATAATATTTCTATAAATTAT +ATTAATAATATTAATAATATTAATAATAATAAATATAATAATATAATTAATTTATTAAAT 
+AATAATAATAATATTAATAATAATAATAATTATAATAATAATAATAATAATTATATTGGT +AATATTAATAATATTTATAATAATATAACTATTGATAATATTCCTATAGATATTTTAATA +TATAAATATTTAGTTGGTTGATCTATTAAATTTAAAGGTAGATTAAGTAATAATAATGGT +AGAACTAGTACACTTAATTTATTAAATGGTACTTTTAATAATAAAAAATATTTATGAAGT +AATATTAATAATAATTATAAATTAAATTATATCCCTTCTAATCATAATTTATATAATAAT +TCTAATATTAATAAAAATGGTAAATATAATATTAAAGTTAAATTAAACTTTATTTAATAT +ATATATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAAATAAAA +TAAATATAATAAATAAAATAAATAAATAAATAATATATATATATATATAAATATATAAAA +TAATATTTACTTTTTATATATATATAATTATATATAAATAAAATATAATATAATATCATA +TAATTATATAAAAATAAAATTATAATTTATTTATATTAAAAATATTAATTAATTAATTTT +TTTATATAATTATTATAATAATAATTTAATTAAAAATAAATATCAAATAAAATTATAAAT +TAATCCTACTTTTGGATCCTATTTATATTTTATTATTATAAATAATTATTATTGATAGTT +AATTAAATAAAAATATATATATATATTACTCCTTCGGGGTCCGCCCCGCAGGGGGCGGGC +CGGACTATTATAATTATTATTAATATATTAATTATTAAATTATATAAACCGCCCCCGCGG +GGGCGGTTAGTTATTTATATTAATATATTTTATATTAATATATAATACTCTTTTTTCTAT +TATATTTTAATATATAATATTAAAAAAAATAAATAAAATAATATTCTTAATTTTTATTCT +TTATCTTCTTTAACCAAACTCCTTCGGGGTTCGGTCCCCCTCCCATTAGGTTAGGGAGGG +GGTCCCTCACTCCTTCGGGGTCCGCCCCCCCCCGCGGGGGCGGGCCGGACTATTTTAAAT +TTTAATTTAAATTTTATAAATATAATATTTAATTATAAATTTAATAATAATATATAAAAA +ATATATATATGGTTAATATATATAAAGATTATAATCTTTTTATTAAATAAAGGAAAATTT +ATTATATAATTTTTCTCTATAGTTATATATTTAAAACTTATTTTTTTTTTTTTATAAATA +ATAATTATAATAAATAATATTAATTATTTATTATATAATTAATTGGCCCCCATGCTGGGT +TCCGGAACTCCTCCTTCTCGCGAGGTTAACACCTATTATATAACTATAACTATAACTATA +ACTATAATTATAATTATAACTATAACTATAAATATTCATTTTAATAATAATAATAATAAT +AATATTAATATAAATAGTCGAAGAATATATTTATTTATTTTAATATAAATAAAAAGTTTC +AATTAATTTGAATTTGGAATTAAATTATTACTTCATATGGGGTTATGGATTTCGTTCGGA +ACTCCTCCCTCCTACCTCTATTTATTAATCATAAATCATAAATTATTATTAATTAATAAT +AATAATTTACTCGAGGTTCATACCTATTTTAATATTAATATTAATATTGATAAAATATAT +ATTCACTAAAAAGTATATAATTTACTCAATTTATACTATAATTTTATATTTTTTTATTAT +AATTTAATTATTTCAAATAAAGTAATTATAATAATATATATCCTTTATTAAATATATATT +AATTAATATATATATAAAAAGTAAATATTATTAATTGTATATAATTATAAATAATTAATA +TTTATTAAAATATATATAATTTATAATCCTCATATAATTAATATAATAAATAATATAACA 
+CAATGTAATTTAATTTAATTACATAATAAATTTATTATTATTATAATTATTATTTATTTA +TTTATTTATTATTATAAATTATAAATATTATTATAATTAAAATCAATTATTAATTATTAA +TGATAAATAATTAATGATAAATTATCAATAACCAATTAGATTATTTATCGATATTTAATT +ATATTATATTATATTATATTATATATATATATATATATTATATTATAAAATTTATTTATA +AATATTTGTTTATTTATTTATTTATTGAATAACAATAGAATTAAATATTGTCAATAAATA +ATAAATAATGTTTAATATATATTATATTATATTAATATTAATATTATTATTATTTTTTTT +ATTATATTAATATAATTTATAAAAATATAAAATTATTATTTTTATTATAATTTATATATA +TATAATATATATATTTATTAAAATATTTTAAGAAAGGAGAAAAATAATTAAATTAAATTA +AATTAAATTATTTATTATTATTATTATTATTTATATAATAATATATTATTTAAATATTTA +TATATTATTTTTATATTAATATTTATAGATGGGGGGTCCCTATTATTATTGAAAATAATA +ATTATTAATGGACCCCAGATAGCTTCTTGTTTATCATTTATATATATATATATATTATTA +ATTATTTTATTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTTTATAAT +ATTATTATTAATTATTTAATTAATATTATAATCATATAATTTAATATTTTATTTAATTTT +ATTAAAATTTAATATATATATTTTTATTATTATTTAATTAATTTATAAATATAAAATATT +CTTAATATTAAAAATAAATAAATAATAAAGTTTATAAATCATATATTATAATTATTTATT +ATTTTTATATTATATTAATAAAATATTATTATTATAAAAAAAAATAGAAATTTTATAATA +TTTTTATATATTTTTAATTATTATTATTAATATTTATTAAAGGAAATATAAAAACCGAAG +GAATATTATAATTATAATTATAATTATTATTATATTTAATTTATTATTATAATAATAATT +ATAGTCTGCCCCCTCTTTATCTTTATTTTAAAGTTCCGGGGCCCGGCTACGGGAGCCGGA +ACCCCGAAAGGAGAAGGATATTTAATAATTTATAATATTTAATTCATATATATATATATA +TATTTTATTTTTTATATATATATTTAATATATTATATTTATATTTATATTATTATTATAT +TTATATTATATTATTTAATTATTTTTTAATAATATATTATTAATATTTTACCTTTTGATA +AATAAAAATTTATTAAAAATTTTATAATAAGTATTAAAATATCATAAAAGTATAATATTT +ATATAAAATGTATAAATTTATAATCTTCTAATTAAATTAAATTAAATAAATAAAATAAAA +TAAATTAAACTCCTTTTGAGATTCACACCTATTTTATTAAAAATAGGTATTCACTTAATT +AAATTAAATTAAATTAAATTAAATTATGGATAATTTATTTAATAAATATATATATTAATT +ATAAAATAATAGTCCGGCCCGCCCCGCGGGGCGGACCCCGAAAGAGTCTGCCCCTTTTTA +TTTAATATTTAATATTTAATATTTAATATTTAATATTTAATATTTAAAGAAGGATATATT +TATAATTTATCATAATATTATTTAATAAGAAATTATTAATTAATTAATTAATTAATTTAT +TTATTGTTTATATTTATTAATATTAATATAATAAAAATGTAAAATACTTAATATTATTAA +TATTATTATATATAATATATATAATAATATATTATATTTATATCTCCTTTATTCCTTTTT +CCCCCGATGGGGACTTATTATATTATATTATTATATATTTCTTCGATAACTTTATATATA 
+TTTTATTTTTATAAAAAAATATTTATATATTATTATTTACAATAATAATTATTAATAGTC +CGGCCCGTCCCGCGGGGGGGAACCGAAGGAGTGCGGGACCCCGTGGGAACCGCATCCCTT +TTTATTTTTAATTAAGAAGGAGTGAGGGACCCCGTGGGGACCGAACCCCGAAGGAGTCTT +TTTTCTATTTATTAATAATAACTATAAATTATATTTAAAATAATAATTTACTTGTTATAA +TCTTAATGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAAGTATATAAAT +ATTTACTTGTTATAATTTATTATATATTTATAACCTCCTTCTTAAAATTATCTTTACTTT +ATAATAAAAATTAATATAATATAATCTGATAATAATCGAATTTTATTATATTTAATTTAA +TTAATAATAGACAATTATTATTATTATTTTACTTATTAATATTAATTTAGATTTATATAT +ATAAATATTATTAATTTTATATTAATTTTTTATTAATTATTTATTTTTATATTCATATTT +TTTATTAATATTATTTTTATTAATAACTTTTTAAATAATTATAAACTATATATTATTTAT +ATTTATATTTATAATAAATGAAACAATTATAATAAAAATTACAATTACAATTATATTATA +ATTATGATTACAATAGGGTTAAACATTACCTGTGAACAACTGGTAATGTTTAACCCGTAT +TATTATTTATTATATTATATATATATTAAAATATTAATATTAATATTAATATTATATTAT +ATTATATTATATTATATTATATTATATTATATTATATTTATAATTATATTATATTATATA +ATTTATATACTTTTATAATTCTTATTATTATTTATTTATTATTTATTTATTATTATTTAA +ATATATTATTATTATATATTAATAATATATATATTATTTTATATATTTTATTTAATATAA +ATTATTTATATTTTTATATTTTATTATGAGGGGGGGTCCCAATTATTATTTTCAATAATA +ATTTATCATGGGACCCGGATATCTTCTTGTTTATCATTTATTATTCTTATTATTTGGTTT +TTATTTAATATTTATAATTTATTTTATACAATTTATTATATTGTTTATACCTTATTATTA +TTATATAATATATTATATTATTATAATAATTTAATTAATTATATTATTAAATATTAACTA +ATGTGTGCTCTATATATATTATTCATTCTAGTTTCTAATCACCCACCCCCTCCCCCTATT +ACTTATATATCTAGAAATAAAAATACATAACATATATTTTAAATATATATATATAATTAT +ATAATAATTATTATATATAAAATATATATATATATAATATATATTTATAAAATAATAATA +ATAAATATTATTACTCCATTAGAGGTTTTGGTCCCATATCAGGAACCGAAACTATAATAA +TATATAATATTATAATAAAGATATTCTTATTTATAATATATTATTAAATAAATTAATAAT +AATTATAATATATATATATAATATATTATAATATATTTATTCGAGAACTTTTTATTTATT +ATAAAATAAAATATTTTATTTATTATTTAGTTTTTTTTTATTAAACATTTTATAAAAATA +TAAATGTTAATAATATTATGATTAATAAGTAATAATAAATTTATTTATTTTTATTAATTA +CTTCTTCGAGGTATTAGTATCAGTATCAGTATCAGTATCGTAAAAAACGGGTGACTAAAA +TATATATATATATAAAATTATAAATAAAAATATTATAATAATTTTAAATAAATAAATATC +AATATATTATTATTATTTATATTATAATAAATATTATCTAATAATAGTCCGGCCCGCCCC +CGCGGGGCGGACCCCGAAGGAGTCCGAACCCCTTTTTTATTTAATTTTATTTAAAGAAGG 
+AGTGAGGGACCCCTCCCGTTAGGGAGGGGGACCGAACCCCGAAGGAGATAATTAGATATA +ATTATATTTTATTTTATATAAATTATATAATATTATATAATAATAATTATATAATAAGTT +AATAATAATTATATAATAAGTTAATAATAATCATATCTCCTTTATAAATGAACTTTTATT +AAATATATTTTATTAAATATTAAATATATTTTTTATAATATTAAATATATTTTATTAAAA +TATTTAATATATTTTATTAAATATTAAATATATTTTATTAAATATTAAATATAAATAAAG +GTTTATATTATAATTCATTATTTATATCTTCTTTATAAATTAATATTCGTATTAGATCCT +TATTTAATTTATAATCCTTTAAAAAACTTTTAATAAATATAATATAATATATATATAATT +TTTATTATTTTTATATTATTTTTATTATTTAAATATATTATATATTTCATTATAATAATT +ATTTAAAAAGTTATTTAATAAATAATCTGATATTATATTTTATAATTAATTTTATTTATT +TTATTTATTATATATATTATTATATATAATTAAAATTATAATTACAATTATAACTATAAT +TAAATTAAATTAAATTAAATTGGATTAAATTAAATTAAATTGGGCGCCAAGCCGGTTGTT +CACCGACTTGGTCCCAATATAATATGAGATAATATAATATACTATATGATATAACATAAA +TATAATATATTATATGATATAACATAAATATAATATACTCCTTCGGGGTCCGCCCCCGCG +TGGGCGGACCGGACTATATGAATATATTATTATTATAATTATAAAATTATAATAAATAAA +TAAAATTTCTTTAATAATTATTAATTAATATTATTAATTTATTTACAAATATTTTATTAA +TTTTTATTTTTATTAAATATAAATATATAAATATATATATATTTATTTATAATATTATTT +ATATTTATTATATATTATTATTAAATATATTTTTATTATATATCATTAAATATTAATATG +TTATTATAGTGGTGGGGGTCCCAATTATTATTTTCAATAATAATTATTATTGGGACCCCG +GATATCTTCTTGTTAATCAATTATTATATTATTTAATTTATTTTATTTCTTATTTATAAT +TTATATTATATAATTTATTATATTGTTAATACTCCTTCGGGGTCCCCGCCGGGGCGGGGA +CTTTTATTTATATTATTAATTATATTATATTATTATAATATATTTAATTGATTATATTAT +AAAATTATAACTAATGTATGCTTTGTATTTATTGAATAGTTTGGTTCTTATCACCCACCC +CCTCCCCCTATTACTTCTCCGAGGTCCCGGTTTCGTAAGAAACCGGGACTTATATATTTG +GTAATTAAAAATATAACTTATATAAATATTTAATAAATATATATTAAATATATTATTATT +AATAATTTATTATTATATAAAAAAATAATAAATATTATTAATGATTTAAATTATATAAAT +ATTAATTATTAAATAAATAATTATACTTTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGG +GCCCCGGAACTTTAAAATAATATATATATATATAAAAGTATTTTATAATAATTAGTTTAA +TTATTATTCTTTTTTTTTATTAAATATAAAATCATTTTAGGTTATTAATTTTTATTTATT +AAAAATAAATTTTATAATTAATATTTCTCCTTTCTTAAAATAAATAATATTATTATTATA +ATTATTAATTAATGAATACTCTTCTCTTTTGGGGTTCGGTCCACCCTCCCGTATACTTAC +GGGAGGGGGGTCCCTCACTCCTTTTGAGACTTTAATTTTATAAATATAAATATAAATATA +ATAAGATGTTAACTCTTTTATAAATAAATAATAAATATAATTCTATTTTTAATAATAATA 
+TATAATATTTTTATAATAAAATATATAAATAATAATATTTATATATATATATATACTTTT +TTTTATATAAGAATAATATATATAGTTCACATTGGAGGCGAGTAAAAGGAGATAAGAAAT +ATAATATAATATAATAATAAAAATATAATGAATAATAATAATAAAAATTTATATAATAAC +AAAATAGTCCGACCGAAGGAGATGAGATTATTAATATTATTAAATAATAAAATGTATTAA +TTATAAAATATAAAACCTATAAATAATTTATAATATAATTTATATTATGATAATAATAAT +ATATATATTATAATATTTTATATATATATTTATTATATTTATATTTATATAAAAAAGTGA +TATTGATTAATTAATTAATTTATAATTAATAATTATTAATATAGTCCGGCCCGCCCCCGC +GGGGCGGACCCCGAAGGAGTCCGGCCGAAGGAGTTTATTATATTATATTAAATAAGATTT +ATAATATAATTAATATATATTTTAATAAATATAAAAGATTATATTATATTATAAAAAGTA +TATTTTATATTTATATTTTATTTATTATTATTATTATATATATAAGTAGTAAAAAGTAGA +ATAATAGATTTGAAATATTTATTATATAGATTTAAAGAGATAATCATGGAGTATAATAAT +TAAATTTAATAAATTTAATATAACTATTAATAGAATTAGGTTACTAATAAATTAATAACA +ATTAATTTTAAAACCTAAAGGTAAACCTTTATATTAATAATGTTATTTTTTATTATTTTT +ATAATAAGAATAATTATTAATAATAATAAACTAAGTGAACTGAAACATCTAAGTAACTTA +AGGATAAGAAATCAACAGAGATATTATGAGTATTGGTGAGAGAAAATAATAAAGGTCTAA +TAAGTATTATGTGAAAAAAATGTAAGAAAATAGGATAACAAATTCTAAGACTAAATACTA +TTAATAAGTATAGTAAGTACCGTAAGGGAAAGTATGAAAATGATTATTTTATAAGCAATC +ATGAATATATTATATTATATTAATGATGTACCTTTTGTATAATGGGTCAGCAAGTAATTA +ATATTAGTAAAACAATAAGTTATAAATAAATAGAATAATATATATATATAAAAAAATATA +TTAAAATATTTAATTAATATTAATTGACCCGAAAGCAAACGATCTAACTATGATAAGATG +GATAAACGATCGAACAGGTTGATGTTGCAATATCATCTGATTAATTGTGGTTAGTAGTGA +AAGACAAATCTGGTTTGCAGATAGCTGGTTTTCTATGAAATATATGTAAGTATAGCCTTT +ATAAATAATAATTATTATATAATATTATATTAATATTATATAAAGAATGGTACAGCAATT +AATATATATTAGGGAACTATTAAAGTTTTATTAATAATATTAAATCTCGAAATATTTAAT +TATATATAATAAAGAGTCAGATTATGTGCGATAAGGTAAATAATCTAAAGGGAAACAGCC +CAGATTAAGATATAAAGTTCCTAATAAATAATAAGTGAAATAAATATTAAAATATTATAA +TATAATCAGTTAATGGGTTTGACAATAACCATTTTTTAATGAACATGTAACAATGCACTG +ATTTATAATAAATAAAAAAAAATAATATTTAAAATCAAATATATATATATTTGTTAATAG +ATAATATACGGATCTTAATAATAAGAATTATTTAATTCCTAATATGGAATATTATATTTT +TATAATAAAAATATAAATACTGAATATCTAAATATTATTATTACTTTTTTTTTAATAATA +ATAATATGGTAATAGAACATTTAATGATAATATATATTAGTTATTAATTAATATATGTAT +TAATTAAATAGAGAATGCTGACATGAGTAACGAAAAAAAGGTATAAACCTTTTCACCTAA 
+AACATAAGGTTTAACTATAAAAGTACGGCCCCTAATTAAATTAATAAGAATATAAATATA +TTTAAGATGGGATAATCTATATTAATAAAAATTTATCTTAAAATATATATATTATTAATA +ATTATATTAATTAATTAATAATATATATAATTATATTATATATTATATATTTTTTATATA +ATATAAACTAATAAAGATCAGGAAATAATTAATGTATACCGTAATGTAGACCGACTCAGG +TATGTAAGTAGAGAATATGAAGGTGAATTAGATAATTAAAGGGAAGGAACTCGGCAAAGA +TAGCTCATAAGTTAGTCAATAAAGAGTAATAAGAACAAAGTTGTACAACTGTTTACTAAA +AACACCGCACTTTGCAGAAACGATAAGTTTAAGTATAAGGTGTGAACTCTGCTCCATGCT +TAATATATAAATAAAATTATTTAACGATAATTTAATTAAATTTAGGTAAATAGCAGCCTT +ATTATGAGGGTTATAATGTAGCGAAATTCCTTGGCCTATAATTGAGGTCCCGCATGAATG +ACGTAATGATACAACAACTGTCTCCCCTTTAAGCTAAGTGAAATTGAAATCGTAGTGAAG +ATGCTATGTACCTTCAGCAAGACGGAAAGACCCTATGCAGCTTTACTGTAATTAGATAGA +TCGAATTATTGTTTATTATATTCAGCATATTAAGTAATCCTATTATTAGGTAATCGTTTA +GATATTAATGAGATACTTATTATAATATAATGATAATTCTAATCTTATAAATAATTATTA +TTATTATTATTAATAATAATAATATGCTTTCAAGCATAGTGATAAAACATATTTATATGA +TAATCACTTTACTTAATAGATATAATTCTTAAGTAATATATAATATATATTTTATATATA +TTATATATAATATAAGAGACAATCTCTAATTGGTAGTTTTGATGGGGCGTCATTATCAGC +AAAAGTATCTGAATAAGTCCATAAATAAATATATAAAATTATTGAATAAAAAAAAAATAA +TATATATTATATATATTAATTATAAATTGAAATATGTTTATATAAATTTATATTTATTGA +ATATATTTTAGTAATAGATAAAAATATGTACAGTAAAATTGTAAGGAAAACAATAATAAC +TTTCTCCTCTCTCGGTGGGGGTTCACACCTATTTTTAATAGGTGTGAACCCCTCTTCGGG +GTTCCGGTTCCCTTTCGGGTCCCGGAACTTAAATAAAAATGGAAAGAATTAAATTAATAT +AATGGTATAACTGTGCGATAATTGTAACACAAACGAGTGAAACAAGTACGTAAGTATGGC +ATAATGAACAAATAACACTGATTGTAAAGGTTATTGATAACGAATAAAAGTTACGCTAGG +GATAATTTACCCCCTTGTCCCATTATATTGAAAAATATAATTATTCAATTAATTATTTAA +TTGAAGTAAATTGGGTGAATTGCTTAGATATCCATATAGATAAAAATAATGGACAATAAG +CAGCGAAGCTTATAACAACTTTCATATATGTATATATACGGTTATAAGAACGTTCAACGA +CTAGATGATGAGTGGAGTTAACAATAATTCATCCACGAGCGCCCAATGTCGAATAAATAA +AATATTAAATAAATATCAAAGGATATATAAAGATTTTTAATAAATCAAAAAATAAAATAA +AATGAAAAATATTAAAAAAAATCAAGTAATAAATTTAGGACCTAATTCTAAATTATTAAA +AGAATATAAATCACAATTAATTGAATTAAATATTGAACAATTTGAAGCAGGTATTGGTTT +AATTTTAGGAGATGCTTATATTCGTAGTCGTGATGAAGGTAAACTATATTGTATGCAATT +TGAGTGAAAAAATAAGGCATACATGGATCATGTATGTTTATTATATGATCAATGAGTATT 
+ATCACCTCCTCATAAAAAAGAAAGAGTTAATCATTTAGGTAATTTAGTAATTACCTGAGG +AGCTCAAACTTTTAAACATCAAGCTTTTAATAAATTAGCTAACTTATTTATTGTAAATAA +TAAAAAACTTATTCCTAATAATTTAGTTGAAAATTATTTAACACCTATAAGTTTAGCATA +TTGATTTATAGATGATGGAGGTAAATGAGATTATAATAAAAATTCTCTTAATAAAAGTAT +TGTATTAAATACACAAAGTTTTACTTTTGAAGAAGTAGAATATTTAGTTAAAGGTTTAAG +AAATAAATTTCAATTAAATTGTTATGTTAAAATTAATAAAAATAAACCAATTATTTATAT +TGATTCTATAAGTTATTTAATTTTTTATAATTTAATTAAACCTTATTTAATTCCTCAAAT +GATATATAAATTACCTAATACTATTTCATCCGAAACTTTTTTAAAATAATATTCTTATTT +TTATTTTATGATATATTTCATAAATATTTATTTATATTAAATTTTATTTGATAATGATAT +AGTCTGAACAATATAGTAATATATTGAAGTAATTATTTAAATGTAATTACGATAACAAAA +AATTTGAACAGGGTAATATAGCGAAAGAGTAGATATTGTAAGCTATGTTTGCCACCTCGA +TGTCGACTCAACATTTCCTCTTGGTTGTAAAAGCTAAGAAGGGTTTGACTGTTCGTCAAT +TAAAATGTTACGTGAGTTGGGTTAAATACGATGTGAATCAGTATGGTTCCTATCTGCTGA +AGGAAATATTATCAAATTAAATCTCATTATTAGTACGCAAGGACCATAATGAATCAACCC +ATGGTGTATCTATTGATAATAATATAATATATTTAATAAAAATAATACTTTATTAATATA +TTATCTATATTAGTTTATATTTTAATTATATATTATCATAGTAGATAAGCTAAGTTGATA +ATAAATAAATATTGAATACATATTAAATATGAAGTTGTTTTAATAAGATAATTAATCTGA +TAATTTTATACTAAAATTAATAATTATAGGTTTTATATATTATTTATAAATAAATATATT +ATAATAATAATAATTATTATTATTAATAAAAAATATTAATTATAATATTAATAAAATACT +AATTTATCAGTTATCTATATAATATCTAATCTATTATTCTATATACTTATTACTCCTTTT +TAATTAAATTAAAAAGGGGTTCGGTTCCCCCCCCCCATAAGTATGATTATAATTATAATT +ATAATATAAGGGAGGGGTCCCTCACTCCTTATGGGGTCCCGGTTGGACCGAGACTCCTCC +CTTGCGGGATTGGTTCACACCTTTATAAATAAATAATAAATAATAAATAAAGGTGTTCAC +TAATAAATATATATATATATATATATATATTATATTATAATATTATTTAATACTTAATAT +ATTATATATTTTATATTTAATAAATAAAAAAAATATTAATAAATAATAATATTAATAATA +AAGAAATTATAATTAATACCCTCTATATATAATTCTAATTAATTAAATTAAATATTTATA +TATAATAATCAATATATTATTAATTTAATAATTATTATAATAGTTCCGGGGCCCGGCCAC +GGGAGCCGGAACCCCGAAAGGAGTTTATAAAAGATATATTTTTATATTATATTATATTAT +ATTTAATAAATATTACCTTTTTTTATTATTATTTTTATATATTATATAATATTATTAATT +TTTATTATAATATTATTTACTTTTTTATTGGATTATTTATTTATTTATTTATTTATTAAT +TAATTAATTAAATATTTATTAATTAATATATATATTAAATATTAATATTTCATTAAAAAA +AAGAGATATATGAATAATATATTATGTTATATTATATTATATAATTATATTATTTTTATA 
+ATATTAATAATTAAAAATAAGAACTTATTTAAAAATTATAATTATGATAATAAATTAATA +CTTTTTAATTTATAAAAATATAAATTTCTTTACATATATATATATATATATTATTATTAT +TTATATTAATCATAATTTTAATATTTATAATAAATTTATATAAAATCAATTATAATATTA +TATACTTTTTATATACTTTATAATCTTTATATCTTCACCCCCCCTTTTTTAATAATATAT +TATATTAAAAATATAATAATTTATATGATTTATTAATACTTTTTATATAATTATATTATT +ATTTTTTTTTATAGATGTTATATTATTTTTTATAATAATTTTTTTTTATTTAAATAAAAT +TTATAACTCCTTCTTAATTAAAGATAAAAGGGGTTCCCCCCTTAAGTATAAGTATAAGTA +TAAGTATAAGTATAAGTATAAGTATAAGTATAAGTATAAGTATAAGTATAGTATACGGGG +GGGGGGTCCCTCACTCCTTCGTTAATTTATATATATTATTAATAATTATTTAATTTTTAT +TATTTATTTATATATAAAAATATTCTAAAATTATTAATATTTATAATAGAATAAATATTA +TAAAGTATAATTATAAATAATTAATTATTTAAATAATAATAATATATTTATTATTATATA +ATAAATATATTATAAATAATAGTTATATTAGCTTAATTGGTAGAGCATTCGTTTTGTAAT +CGAAAGGTTTGGGGTTCAAATCCCTAATATAACAATAATAATAATAAAATATTAAAATAA +ATATAATATTTATAAAAAATTTATTAATTTATATAAAAAATATATATATAAATAATAATT +ATAATAAAACATTTTATAATCAATAATTTAATAAATAATCTTCTTATTATAATATTATGT +TTAAATATTACTCTTTATGAGGTCCAACAAACTAATAAGATATAAATATATATATATTAT +ATAATAATAATAATAATAATATATTATTTAATATATTATCAAGAAGATAAATATAAATAA +TATTTTAATAATTTTAAATAAATCTAATTTATATATTAATAATTTAATAATCTTAATATT +TATTATCATTATTTCATATTTATATTATATAAATATTTATTTAAATAAAAAATATTAAAG +AGTTTATTTTATTTATTATAAATTATTTAATAAAATATATATAATAATATATAGAATAAA +GATATAAATAATTATAAGTATATAAAGTAATAAAGGAGATGTTGTTTTAAGGTTAAACTA +TTAGATTGCAAATCTACTTATTAAGAGTTCGATTCTCTTCATCTCTTAAATAAATAATAT +AATAATAAAATATTATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGAT +AAATATATATATATTTATAATAATTATATAATAAAGGTGAATATATTTCAATGGTAGAAA +ATACGCTTGTGGTGCGTTAAATCTGAGTTCGATTCTCAGTATTCACCCTATAAATAATAA +TAATAATATATTTTATTATTCTTAAATTTTTTATTCTTTATATTATATATATAATATTAA +TATTATTACTTTTTAATAACAAAATATTATAATTAATTGATATATATATATACCAAATAT +AATTAATTGAAATTAAATAATAAATAAAATATTTACTTCTTTATTAAAATTCTAATTAAT +TGATTCTTTTTATTGAATATTAAATTCTATTATAACTTATTAATTAATTAATTAATTAAT +TATAATAATAATAATATTTATTATTAATTATTAAATATTTATTATTATATATAAGATTTA +ATTTTAAATATTAATAAAAAAAGAATAAAATAAAATAAAATGAATAATATTTCTTTATCT +CTTTCGATCGGACTCCTTCGGCCGGACTCCTTCGGGGTCCGCCCCGCGGGGCGGGCCGGA 
+CTATTTATTATTATAATATAATATTTAATCAATAGATTTATAATTTATTTAATGAATATT +TTATAAATATATAAAACAATTCCTTTTTATTATTATAAATTTTTCATTATTTATTTATTT +ATTTATTTATTTATTCAATATATAAAAATAATTATAAAAAGATTATTAAAAATAATAATT +TAATGATAAATATATATTATATATATTAATATAAAAATAATAAATATAAATATATTATGT +AAATATTATATAAATTTGTATATGTATATATTATAATAATGTTATATAAGTAATAATATA +ATAAAATATTTTATGTAATTTATATATATTTATAATTATAAAATAAAAATATTATAAATA +ATAAAATTAATAATAATAATAATTTTAATAAAATAAATTATATATTTAATTTTATTATGA +AGTTTATACTTAATATAAATTATATTTCCTTTATAAATTATTAATATATCCTTTTTAATT +AAATAAAATAAAAATATTATAAATATTAATAATTAATTTTTTATTTATATTTATATATAT +ATTAAAGATTAAATATATTATTAATACTAATTTATAATTTATTATTAATAAATAGTCCGG +CCCGCCCCCTGCGGGGCGGACCCCGAAGGAGTTCGACTTAAATTATAATTTAATAATTTT +TATTTATTAATAGTTTCGGGGCCCGGCCACGGGAGTCGGAACCCCGAAAGGAGTTTTATT +ATTAATATAAAAAGAGTAAGGATAATAATAAATTCTTTTAATTTATTTTTAATAAAATAT +AATTTTAAAATAGTTTTTATAGTCCGGCCCGCCCCGCGGGGGGGGGCGGACCCCGAAGGA +GTTCGGTCTGGCATTAATTATAATAATTATATTAATATTATTATTATTTATTATATTATA +ATATATTTATTTATATTTTATAATATTAATAATTATTTTATATTTAATAAATATAATATA +TATATTATTTTTTTTAATAACTATCTAATTAATAGCTATTTTGGTGGAATTGGTAGACAC +GATACTCTTAAGATGTATTACTTTACAGTATGAAGGTTCAAGTCCTTTAAATAGCAATAA +ATATATATAATATATATAATATATATAAATGAGTCGTAGACTAATAGGTAAGTTACCAAA +ATTTGAGTTTGGAGTTTGTTTGTTCGAATCAAACCGATTCAATATTATAATATATATATT +ATTTATATATAAATATATAATTATACTCCTATTTTTATATTAATTAATTAATAATATATG +ATAATATAAAAATTATTGAATTATTAACTCTTATTAATAATAATAATAATCATAATAATA +ATATATATATATATAGTATATATATAAAAGTTTTATTATATTATATTATATTATATATTT +ATTTATATATAATTCTTATTAATTGAAAAAAGAATAATTAATAATCTTATTAAAAAAATA +AATACTTTCATTTTATTTTATTTTATTTAATTTAATTATAATATATAAATATTAAAAAAA +GGATATAAGTTTTTTATAAGATATAATATATATATATATTAAATATAAAGAAGTTAATAT +TTATATTTTAATTATAAAATGTTAATACTCCTTTGGGGACTTATTAATTAAATTATTAAT +TAATAATAATTTATGATTTATAAATAATAAATAAAGGAATAAGTATCAATTAATTAATAT +ATTATATTTAATATTTTATATTTAATATTTAATATTTAATATTTTAAGTTCCGGGGCCCG +GCCACGGGAGCCGGAACCCCGAAAGGAGTAGTATTAATTATGGATAGTGAGGGTGGATTT +AATCCTTTTGTTATGTTATTAATTAATTAATTAATTTATATATATAAAATATTTTAATTA +ATTTTTATATAAATATATATATATATATATATTAATAATAGTCCGGCCCGCCCCGTGGGG 
+CGGACCCCAAAGGAGTAATATATATTATGTATAAACAATAGAGAATATTGTTTAATGGTA +AAACAGTTGTCTTTTAAGCAACCCATGCTTGGTTCAACTCCAGCTATTCTCATAATATTA +TATATATATATTTCCCTTTCTAAAAATAATAATAATTATATATAATAATAATATAATTAT +ATATATATATATTATAATAATAATAATAATAATAATAATAAATAATAATAATTATTTTTA +TTAATAATATTAATATATTATAATTATTAATAAATATTAATAAAAATAGCTCTCTTAGCT +TAATGGTTAAAGCATAATACTTCTAATATTAATATTCCATGTTCAAATCATGGAGAGAGT +AATTATATTATATTAATAATCCCCCCCCCATTTTTAATTAAATTAAGAAGTTTAATTTAC +TATTTAATAATAAATGAAATAATAATAATAGATATAAGTTAATTGGTAAACTGGATGTCT +TCCAAACATTGAATGCGAGTTCGATTCTCGCTATCTATAATTAATATTAATATAAATTAA +TATCCTATAATTAATTAAATACAAAATTATATTAAAACTTATATTATATTATATTATAAT +ATTATATTATTATTATATAAAAATATAATAATAATAATATTTAATTTTATTTAATAATAA +TATTTTATATAATAAAATAATCATATTTATAATATTTAATATTAATAATAATTTATTATA +ATAATTCTTTAATATACTTATTTATTATTATTTTAATAAATAAATATAATTCTTATAAAT +ATATTATAACAAAATATATTATATTTTAATTAAATACAATATTATAAATATATATATATA +TATAAATATTTATATAAAAAAAAAATAAAAATATTTTAATAATTATTCTTTATAAATAAA +TAATGATAATAATAATTTATAATAATCTCCTTGTGGGGTTCCGGCTCCCGTGGCCGGGCC +CCGGAACTATAATATATTTTAATATATTTTTTATTACTCCTCCTTTGGGGTCCGCCCCGC +GGGGGCGGGGCGGACTATAATAATTTTTTATTGATAAAAAAGTATATATAATATAATTAA +TATATTTCTTTTTATATAAATTATAAATATTATTTTATAATAAAAAAAGTATATATAATA +TTATATATTTAATAAATAATATAATAATAATATAAATAAATATATATATATTATTAATAT +ATTAAATTTTATAATAATAATTATAATAATAGTAGTAGGTATAAATTTTAATAAAGAGTT +TTATTCCAATGGAGTAATAATAATAATAATAATAAAATAAAGGATCTGTAGCTTAATAGT +AAAGTACCATTTTGTCATAATGGAGGATGTCAGTGCAAATCTGATTAGATTCGTATATTT +ATACTTAATATAAAAAAAATAAATAATAATCTTTTTTATTATTATATTTATTAATAATAA +ATTATTTTGTTATTATTATTAATTTATATTAATATTTTATATAAATTATTTATTTAATCT +TTCATTATATATTTAATATATTATTAATATTAATTAATATTTTATAATAAATAAATAAAA +TAAAATAAATATTTTAATATAATACTCCTTCGGGGTTCGGTCCCCCTCCCATTAGTATAG +TATAGGGAGGGGTCCCTCACTCCTTCGGGGTCCCCGCCGGGGCGGGGACTTATTTTTATA +TTTATTAATAATAATTAATTTTTATATAAATTTATTATTTCTTACAATATATTTATTACT +ATTATTTTTTAATAATCTTATATATAATATATAAAATATATATATATTATATATATATAT +AAATATAATATATATTATTATAAATATTTATAATCTTATTAATTAATTAGATTATATTAT +ATTATATTAGATCATATTATATTATATTATATTATATTATATTATTATTATTAATATTTT 
+TATTTTTATTTTATATTTAATAGTAAAAAATCATAATTTTATAATTTATTAATTATTATA +TAATTTCATTAATATATTTCTTCTTTTTATTTATTTATTTATTACTTATTAATAGTTCCG +GGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAAAATAATATAAAAAATAATTATAATT +TATTATAATTTATTAATTTATTAATTTATTAATTTATTTATTAATTTATTAATTTATTTA +TTATTATATTTTTTTTAATAAAGGAAAATTAACTATAGGTAAAGTGGATTATTTGCTAAG +TAATTGAATTGTAAATTCTTATGAGTTCGAATCTCATATTTTCCGTATATATCTTTAATT +TAATGGTAAAATATTAGAATACGAATCTAATTATATAGGTTCAAATCCTATAAGATATTA +TATTATATTATATAATATTATATATTAATAAATATTATTAATTAATTTATTTATTTATTT +ATTATTAAATAAAAATATTTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAA +AGGAGAATAATATAAAATATTATAATTATTTATATATTAATTATTAATTATTTATTATTT +ATTATATAAAAAGTATATAATTTTATATTTTAATATAGGGTTAATTAATTAATTATTAAT +TTTTTATAATAAGATAATAATATATTAAAAACTTATTATAAATTTATAAAATAATATTTA +TTTACTTTGATATTATTTTTAATCTTTCATTAATATATATTTTATTATAAGTAATAATAT +AGTTTAATTTAATTAATATAAATAAATTACATAAGAATAATATTATAATAATATTATATA +TTATATAAAGAAATAATAATTTATATTTTTATTTTTTTTATAAATAATATAAATATAAAT +ATAATGGGGTTATAGTTAAATTTGGTAGAACGACTGCGTTGCATGCATTTAATATGAGTT +CAAGTCTCATTAACTCCAATAATTATATTATATAATATATATATTAATAAATTATATATA +TATATATATATATAAATATTAAATAAATATTATATTAATAAATAATATAAATTATCTAAT +CGAAGGAGATATTTATAATATAATATAAATATTTTAATAAATTAATAAATATTATATTAA +TAAATAATTAATAAATATATAAATTATAATAAATTTTAATATTATTATATAAATTAATTA +AATATAATAATTAATGAAATAGAAACTATAATTCAATTGGTTAGAATAGTATTTTGATAA +GGTACAAATATAGGTTCAATCCCTGTTAGTTTCATATTATATATCATTAATATATAAAAT +ATAAATATATATATTATAATAATAATAATAATAAATATAAATATAATTATATATATATAT +ATATATAAATAAATAATTATTTAATTTATAATAAATATATATAGTTCCCGCGAAGCGGGA +ACCCCATAAGGAGTTTTATTATTAATTATATTTAATAAATATTAATTATTAATTTTATAT +TTATAAATAAATTTATTACTCCTTCTTAATTAAGAATAAAAAGGGATGCGGTTCCCATGG +GGTCCCGCACTCCTTCGGGGTCCGCCCCCTCCCCTGCGGGAGGGGAGCGGACTATTTTAT +TAAAAATATTATAATTAAATAATAATATAAATAATTTATAATATAATAATATATACTTAT +AAATAATATTTAAATCTTATTATTAATTTATAAATCATAAATTATTATTAATAAATATCT +CTTTTAGATAAGATAAATTGAACTTATATTTATATTATATATATATAGATATAAATCTTA +AATAGAGTAAATATATTATAATAATTATATAAATATATATATATTATATTAAGATAATAA +TATATATATATATTAATATATAAGGAGGGATTTTCAATGTTGGTAGTTGGAGTTGAGCTG 
+TAAACTCAATGACTTAGGTCTTCATAGGTTCAATTCCTATTCCCTTCATAATTTATTATT +AATTATATATTATTATAAATCAAATCCATTGAAATTAATATAATCCAATGAATAATTAAT +TTAATACATAATTTAATATATAAAATTATATATATATATACTTTATAAAAAAAAAAATTA +TATAATAATTATATTAATATATTTATATATATAAATAAATAAATAAATAATAATAATTAT +AATTATAATTATAATTAATTAATTAATAAATAAATAATAATTTATATTATCTTTATAATA +TATATATATACTTTTATAAAAAAAATATATAAATAATTCTAAAATGTATATTTCTCCTTT +CGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTAATAAAATTAATAATAAAATAA +TTATTATCTGTATTTAATAAATTTAATTATAGAGTTATATTTCTATATATTTATATATTT +ATTTATTTATTCTCCTTCCGGAACTAATAAAATATATAAAATAAGGGTTTTTATTTATTT +AATTAATATATATTTATTCTTTTATATAATATGTCCTTATAGCTTATCGGTTAAAGCATC +TCACTGTTAATGAGAATAGATGGGTTCAATTCCTATTAAGGACGATAATAATATATATAT +ATTTTAATTTATATATCATATATATATATATATTAAAGAAAATAATATAAAAAGTATGTA +TTAATAATAATAATAAATAAATAATAATAAATAATTTTATTATATTATATTATATTATAT +TTATTGATATATTTATTGATATTTATTAATTTAAGATTATTCATTAAATATATAATTATT +AATAATTTAATATATTTTATAATTTTTATTATATTTTATGTAAGAAGAAACTATTTTATA +TATTATATATATATATATAATTTTTATAAAATGATAAATTTTATATTATAAATATTATTA +AAATATTTTTATAAATATTTAAATTATTTATAAAAAGGTATATAATAATAATTATTAATA +TTATATTATATTATATATTTATTTATATTATATATAATAATATATTTATATATATATTAA +TTAATAAATTAAATAAGTATCTATATTTTATATTATATTATATTATTTTATTTTATTAAT +TCCGGAAGGAGAATAAAAAGTATTCTAAAGAAATTATATATTTATTATTTTTATTAATAT +GTTATAAATTAATAAAAAATAAATATGTATATATAAATTATATTTATTATGTTTAATTAT +TTATAATTTATTATAATATATAGTATAAGATATCTTATTTATATTTATATATAATAAAGA +ATATTATTAAACTAACACCTATATTATATATATTATATTATATAATATTATATATATATT +AATTACTAAGAATAAATTTATAATTAGATAATATTTATATTTATTTATTTATTTAATTAA +CAAATATATTAATATTTTTAATTAATTAATAATACCTTTATATATATATATATATATATA +TTAATTTTAATTATATAATTATCTTTTTTATTAATAATTATAAATATATTATATATTTTA +TATAATAAGATTATAATTTTATAATTATTTTATTTTTTATTAAAAATTATTATTATTATA +ATTATTATATTATAATTATAAATTATTAAAGAATATATTTATTAATATTTTAATAATTAA +TATCTTTTATTTATATTTATAAAATAAGGTATAAATATTGATAATAAAGAGTAAATATTG +TATTAATTATAATAATAATTATAATTAAGGAGCTTGTATAGTTTAATTGGTTAAAACATT +TGTCTCATAAATAAATAATGTAAGGTTCAATTCCTTCTACAAGTAATAATGATTATAATA +TTTATATATATTAAAATAATATTAATAAATAATTACTCCTCCTAGCAGGATTCACATCTC 
+CTTCGGCCGGACTCCTTCGGGGTCCGCCCCGCGGGGGCGGGCCGGACTATTTTATTATTA +TTAAATAGATGTTCATTAAATAATTATAAATATAATTTATCTTTTAAATATATATATATA +ATATAATATTTAAATATATATTATAAATAAATAAATAAATAATTAATTAATAAAAACATA +TAATGTATATTTATCTATAAAAAATATTAATTAAATTAATATATTATTACAGTTCCGGGG +GCCGGCCACGGGAGCCGGAACCCCGAAGGAGATAAATAAATAAATAAATATAAATAATTC +TTCTTCTTTAAAATTAAATAAAATAAAATAAAAAGGGGGGCGGACTCCTTCGGGGTCCCG +CCCCCCTCCGCGGGGCGGACTATTTTATTTTTAAATATATATTATATTAATAATATAAAT +ATAAGTCCCCGCCCCGGCGGGGACCCCGAAGGAGTATAAATAAAAATTAATAATATATTA +TATATATATTATATTAATAATAATAATAATAATAATAATAATAAATAATAACTCCTTGCT +TCATACCTTTATAAATAAGGTAATCACTAATATATTATAATAATAAAAATTATATATATT +ATATATAATCTAAATATTATATATTTTAATAAATATTAATATATATGATATGAATATTAT +TAGTTTTTGGGAAGCGGGAATCCCGTAAGGAGTGAGGGACCCCTCCCTAACGGGAGGAGG +ACCGAAGGAGTTTTAGTATTTTTTTTTTTTTAATAAAATATATATTTATATGATTAATAA +TATTATATATATTATTTATAAAAATAATATATAATTTTAATTATTTTTAATAAAAAAAGG +TGGGGTTGATAATATAATATAATATTTTTTATTTTAATTTATAATATATAATAATAAATT +ATAAATAAATTTTAATTAAAAGTAGTATTAACATATTATAAATAGACAAAAGAGTCTAAA +GGTTAAGATTTATTAAAATGTTAGATTTATTAAGATTACAATTAACAACATTCATTATGA +ATGATGTACCAACACCTTATGCATGTTATTTTCAGGATTCAGCAACACCAAATCAAGAAG +GTATTTTAGAATTACATGATAATATTATGTTTTATTTATTAGTTATTTTAGGTTTAGTAT +CTTGAATGTTATATACAATTGTTATAACATATTCAAAAAATCCTATTGCATATAAATATA +TTAAACATGGACAAACTATTGAAGTTATTTGAACAATTTTTCCAGCTGTAATTTTATTAA +TTATTGCTTTTCCTTCATTTATTTTATTATATTTATGTGATGAAGTTATTTCACCAGCTA +TAACTATTAAAGCTATTGGATATCAATGATATTGAAAATATGAATATTCAGATTTTATTA +ATGATAGTGGTGAAACTGTTGAATTTGAATCATATGTTATTCCTGATGAATTATTAGAAG +AAGGTCAATTAAGATTATTAGATACTGATACTTCTATAGTTGTACCTGTAGATACACATA +TTAGATTCGTTGTAACAGCTGCTGATGTTATTCATGATTTTGCTATTCCAAGTTTAGGTA +TTAAAGTTGATGCTACTCCTGGTAGATTAAATCAAGTTTCTGCTTTAATTCAAAGAGAAG +GTGTCTTCTATGGAGCATGTTCTGAGTTGTGTGGGACAGGTCATGCAAATATGCCAATTA +AGATCGAAGCAGTATCATTACCTAAATTTTTGGAATGATTAAATGAACAATAATTAATAT +TTACTTATTATTAATATTTTTAATTATTAAAAATAATAATAATAATAATAATTATAATAA +TATTCTTAAATATAATAAAGATATAGATTTATATTCTATTCAATCACCTTATATTAAAAA +TATAAATATTATTAAAAGAGGTTATCATACTTCTTTAAATAATAAATTAATTATTGTTCA 
+AAAAGATAATAAAAATAATAATAAGAATAATTTAGAAATAGATAATTTTTATAAATGATT +AGTAGGATTTACAGATGGAGATGGTAGTTTTTATATTAAATTAAATGATAAAAAATATTT +AAGATTTTTTTATGGTTTTAGAATACATATTGATGATAAAGCATGTTTAGAAAAGATTAG +AAATATATTAAATATACCTTCTAATTTTGAAGAACTACTTAAAACAATTATATTAGTAAA +TTCACAAAAGAAATGGTTATATTCTAATATTGTAACTATTTTTGATAAGTATCCTTGTTT +AACAATTAAATATTATAGTTATTATAAATGAAAAATAGCTATAATTAATAATTTAAATGG +TATATCTTATAATAATAAAGATTTATTAAATATTAAAAATACAATTAATAATTATGAAGT +TATACCTAATTTAAAAATTCCATATGATAAAATAAATGATTATTGAATTTTAGGTTTTAT +TGAAGCTGAAGGTTCATTTGATCTATCTCCAAAACGTAATATTTGTGGTTTTAATGTTTC +ACAACATAAACGTAGTATTAATACATTAAAAGCTATTAAATCTTATGTATTAAATAATTG +AAAACCAATTGATAATACACCATTATTAATTAAAAATAAATTATTAAAAGATTGAGATTC +ATCTATTAAATTAACTAAACCTGATAAAAATGGAGTTATTAAATTAGAATTTAATAGAAT +AGATTTTTTATATTATGTTATTTTACCTAAATTATATTCATTAAAATGATATAGTCGTAA +AGAAATTGATTTCCAATTATGAAAAACACTTATAGAAATCTATATAAAAGGTTTACATAA +TACACTTAAAGGTTCTAATTTATTAAAATTAATTAATAATAATATTAATAAAAAAAGATA +TTATTCTAATTATAATATTTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAA +CTAAAAATATTATTGATGATGTATTAAATATAAATCTTATCTATAATTATAAATTACCAT +ATCGTATAAATAGTGATATTCAACGTTTAAATTCTATAAATAATAATAATACTAAATTTA +TTAATGTTGGAGTATTTGTTTATGATTTAAATAATACATTAATTATAACATTTACTGGTT +ATAGACCAGCAGCTCTTTACTTTAATTGTTCTCCTTTTCGGGGTCCCGACTGGGGCCGGG +ACTAAACATGAAATTGCTAAATATATTAAAAATGGTAATGTATTTATAAATAAATATATT +TTAAAAAATATTTTATTAGATTAATTATTATTTTTACTTCTTCTTAAAATTAAAAAAGGA +GACTTTTTTATATTTATATAAATTATATATAAATTATTCTTTTATTATAAATATATAAAA +TTATTTTCTTTTAATTATTTTTATAATTAATTAATTCTTCATGGCTATAGCCATAACTTT +TAATAATATTCTTTTATTCTTTATTATTATATATATATATATTTATTATTTATTATTATA +GAATTTATATTTATAAAAATATTAATATTTTATTTAAAATAAATAATGATTAATTTATAA +AATATATATTAATTAAGTTTCGGGTCCCGGCTACGGGACCCGGAACCCCCGAGAGGAGTT +ATTATATTTATAATTAAATCTTTAAATAATATATCTTAAATTATTATATTGATATTAATA +TTATATTGATATTAATATTAAATATATATTTAATATTTAGCTTATTATTTTATAAAATTA +TATTTATATATTATAATATAATTAAATATATTATAAATTTAATAATTTAATAAAAATATT +CTTTTTATAATTATTATAATAATTAAATAAATAATAATAATAAGAATAATTAATGTATAA +TTTTTTTATAAATATTATATATTTTTATATTAATAGTTCCGGGGCCCGGCCACGGGAGCC 
+GGAACCCCGAAAGGAGAAATATTAATAAAATAAAATAAAATTATAATATAATTAAATTAT +AAGAATTATATTTACTCCTTTTATAATTTATATTTATAATATAATATAATATAAAATAAA +TATAATATAATATAAAATAAATATAATGTAATAGGTATTCACTCCTCTTTGGGGTTCCGA +TCCCCCATACGGATACGGATACGGATACGAATACGGATACGGATACGGATACGGGGGGCC +GTCCCCCAGAACTTAATATTATATCTTAAATAATTAATATAAATATAATATATTATTTAA +TAATAATAATAAATAAATAAATAAATAAATAAATAAATTAAATAAATAATAATATTATTA +TAATTACTTTTTAATAAATAATATTAATATAATATTATATTAGTATTATAAATAGACTTT +TTATTATTTTATATATAATATAGTCCGGCCCGCCCCCGCGGGGCGGACCCCGAAGGAGTA +ATATATTATATAATTATTATTTTTAATTATAAATAAAATATAATTATTATTTATTATATA +ATTTATATAAATATATATATATATTTATTATATATATAAATATAAATATAAATATAATAA +TTAATAATATTAAAGTTTTATATATATTAATATATTATAAAAGGTTTATATATATATATA +ATAAGATAAGTAATAAATTAATTAATTAATAATATAAAAATATATATTATATATTATGTT +TTATTTATATATATATATATATTATGTATTATTATATAAATATATATATATATTATATTA +TAAGTAATAATAAGTATTATATTATATATAGCTTTTATAGCTTAGTGGTAAAGCGATAAA +TTGAAGATTTATTTACATGTAGTTCGATTCTCATTAAGGGCAATAATAATAATATATTAA +TTAATAATTAATTTATAATAAATATATTATAATAATTAATATATATATATATAATATATT +TAATACAAAGAAAATATATATTATATCTCTTATTTATTTATTTATTAATATTTTAATAAA +TATAATATTATAAAAAAAAGTTTATATATTTAGTTCCGGGGCCCGGCCACGGGAGCCGGA +ACCCCGGTAGGAGAAATATAAATATAAATATAAATATAATATAAGTTTGGTATTCATTTA +ATTATATTATTTAATTAAAAATATTCTAAATAAGAATAAATATCAATAAAGGAGTTATAA +ATATATATATATATTAATATATATATAAAAATATATATTATTATTAGTTCCCGCTTTGCG +GGAACCCCGTAAGGAGTGAGGGACCCCATGGGAACCGAACCCCTATTTAAGAAGGAGTTT +TATTATAATAAAATTTATATATATTTAATATATAATTATAAAAATATTATATAATAAATA +ATAAATAATTATTAATAATAAATAAATATAATAATAATATTATAATAAATTTATAAATGA +TTATAATAAATTTATATTAATTTTTTATTTTGTAAATACTAAGATTTGAACTTAGATAAT +ATGCACCTAAAAACATACATTTTACCATTAAATTATATTTACCTTATTAATTATATAAAA +TTTATTAAATATATAATATATTAATTATATAAAAATTATTAAATAAATATATAATATATT +ATATATAATTTATAATATATATATTATAAATATTATTATATATAAAATATAATATACTAC +TTATAAAAATATATATATATATATAAATATATATATAAATAAATATTTTATATATTAAAT +TAAATAATTATTAATAAATTTAATTATAAAGTATAATTTTCAATAGGAATATTTATAAGA +TTATAATAATTATATGAATTATTATAATTATATATATATAAATAAATAAAATAATAATTA +TAATAATTAATAAGAGTTTTGGATATATATCTGTGGAGTATATATTTTATAAAGGAGATT 
+AGCTTAATTGGTATAGCATTCGTTTTACACACGAAAGATTATAGGTTCGAACCCTATATT +TCCTAAATCTAGATATAATATTATATCTATCTTAATATAATAATATTTATTTATTAAATA +AAAAAAAAATAAATAATATTAATTAATATAAGATTCTTTTTTAATTATAATAATAAATAA +ATAAAAAGAAGATATTATCAATGATTTATATTAATAATAAATATAAATAATAAAAAATAT +ATATAATATAATATAATAAATATATTTCCTTTAATATTAATAAATTAATAATAATAATAA +TAATAATAATAAAATATTTAAATAAATTATATTCAATACAAATTAATTATTTATATTATT +AATAATTGAATAAATAATCCGGTCGAAAGAGATATTAATTCGATTATATTATTTATTTAA +TTATATTTAATTTAAATATATAAATTAATATATATATATTGAATTATATATAAATTTATT +TTATAATTTTATAAATAATATATTATTATAAATATTTAATATAATTTATATTATTATTAA +ATAAAAGATTTATTAAATTAATATTATTATTTAATTTTATTATATAGTTTAAGGGATAAT +ATTTTATTAATATTTTTTTTATTTATTTATTTAATTATATTATATATATAATATATATAT +AACAATAAATTTATGACACATTTAGAAAGAAGTAGACATCAACAACATCCATTTCATATG +GTTATGCCTTCACCATGACCTATTGTAGTATCATTTGCATTATTATCATTAGCATTATCA +CTAGCATTAACAATGCATGGTTATATTGGTAATATGAATATGGTATATTTAGCATTATTT +GTATTATTAACAAGTTCTATTTTATGATTTAGAGATATTGTAGCTGAAGCTACATATTTA +GGTGATCATACTATAGCAGTAAGAAAAGGTATTAATTTAGGTTTCTTAATGTTTGTATTA +TCTGAAGTATTAATCTTTGCTGGTTTATTCTGAGCTTATTTCCATTCAGCTATGAGTCCT +GATGTACTATTAGGTGCATGTTGACCACCCGTAGGTATTGAAGCTGTACAACCTACCGAA +TTACCTTTATTAAATACTATTATCTTATTATCTTCTGGTGCTACTGTAACTTATAGTCAT +CATGCCTTAATCGCAGGTAATAGAAATAAAGCCTTATCAGGTTTATTAATTACATTCTGA +TTAATTGTTATTTTTGTTACTTGTCAATATATTGAATATACTAATGCTGCATTCACTATC +TCTGATGGTGTTTATGGTTCAGTATTCTATGCTGGTACAGGATTACATTTCTTACATATG +GTAATGTTAGCAGCTATGTTAGGTGTTAATTATTGAAGAATGAGAAATTATCATTTAACA +GCTGGACATCATGTTGGATATGAAACAACTATTATTTATCTACATGTTTTAGATGTTATC +TGATTATTTTTATACGTAGTCTTCTACTGATGAGGAGTCTAAGGCTATAGAATTATATAT +CTAAATGATTAATATATATATTATTAATAATTAACAATAATTAATATATTATAATTTATA +TATATATATTTTATATTATTATAATAATATTCTTACAAATATAATTATTATATATTATTC +CTTCAAAACTCCTAACGGGGTTCCCGCGAAGCGGGAACTAATAATAATATAATCATTATA +CTCTTTTTTCATTTACCTTTTATAAAGATAATTAATAAATTTATTTAATATTTATAAAAA +AAAAAATATAATATTAATATAATATAATATAATAATGTAATTATTTATATTTTTATATTC +CTTCGAGGTCACCGCCTCACCTCCAGCGGGACTTTTTTAATATGATATAATATAATATAA +ATATTATTAATTTAACTAATATATAAATTCATATATATATATATATTATTAATATTATTT 
+TATAAAAAATATTTTTTATTTGATTATTATTAAATATTATATAGTTCCGGGGCCCGGCCA +CGGGAGCCGGAACCCCGAAAGGAGAAATATTAATATATTATAAATATACTATTTATGTAA +TTATTTTTTGAAGTGAGCACCTATTTTATATATATTTTATATATATTTTATTATATTTTA +TTAAAAATAGGTGTGAACCTCCATGAGAGAGGAATGAATACCTATTTTATAAAGTATATT +TTATATTCTATATATTATAAATATGAACCAAAAAAAGGAGTTTAAAATTTAATTAAATTT +AATTAATTGAATTTCTTTATTATTATTATCATAATTATTAAACCCTTTATTAATATAATA +ATATATTATTTATTATCAAAATACCTACCCTTTTTATAATTTATATCTTTAATAATATAA +TTAAATATAAAATGTTTATTAAATATTATATAAAAATAAAAATAAAAATATATATATATA +TATAAATGATAAATAATAAGGAATTCACACTTATATAAATTTAAATATAAAGTCCCAAAA +GAAGTATTCATTAAATAAATTATCATTAATTAATTATAATAAACTTATTTAATATTATTA +AAGATTAATTTATAATAATAATTATTATTATTATTATTAATATTAATAAAATATATAAAT +AATTAAATAGTTCATATATTAAAAAGAATTAGAATTAAACTTTAATAAGTGTATTTAATA +TATAGAATATTAATAGAATATTTATTCTATTTATATATATATTTATATATATATATATTA +AATAATATTATTTATATTATATTTTATATATATATTATTAATATAAAAAGTATATTATAT +GTATTATATATATTATATATTATATATTTAATAATATATTACTCCTTTGGGGTGGGTCCG +CCCCACGGGGCGGGCCGGACTATTATAATTAATAATTTTATAAAGTTCCGGGGCCCGGCC +ACGGGAGCCGGAACCCCGAAAGGAGAATAAATAATTATATATCTTCTTCTTAATTAAATT +AAATTAAATTAAATTAAATTAAATTAAATTAAATTAAAAAGGGGTTCGGTCCCCCTCCCT +AACGGGAGGGGGTCCCTCACTCATTCAAACTATAATTTAATATATTATGATATTATTTAT +AATTTATAATATAATGTATAATATTATATTATAAATATTATATAAAAATAAAATGATATA +TATAATAATAATAATAATAATAATAAAAAAATAGAAAAGAATAATTTTTATTATTTTAGT +ATATATAAGAATTTAATAAGTTATATTATTGCGGACACCGTTACGCGGAGTGGGGACTAT +TATATTTTACCTATATATATTAATATTATTATAATTTCCTTCTTTAAAAGAAAAAAGGAA +TTCGAGAACTTATTATTATATTAATATATTAATAATAAATAATAATAAATAATAAAAAAG +TAAATAATTATAAATTATATAAAAATATAATTTTATTATTAAGAAAGGAGTTTAAATATA +AAATATAATATTATCATTAAGTTCTAATAAAGGTATATAATGAAGATCTATTAGAACCTA +AAAAGAATATTAATATATCTATTATAAAATAATAATAATAAATATAAATATAAAAATAAA +TTGTAATATTTATAAATAATAATAAAAAATAAATAAGGAATATATTAATTATTAATAATA +AATAAATTATATTAAAATATAATATTATTATTAAATTAAAGAATTATATTAAATATATTT +ATTAAAATTTTATAAATAAGTTAATATTTTATTAAATAATATTTATAAATAATAAAAAAA +AATAAGTATATAATTATTAATATATTAATTTATTATGTTATATATTTATATATTTCAAAT +ATATAAGTAATAGGGGGAGGGGGTGGGTGATAATAACCAGAATATTAAATAAATACAGAG 
+CACACATTTGTTAATATTTAATAATATAATCAATAAATATATTATAATAATATAATATAA +TTAATAATAGATATAAAGTATAAACAATATAATAAATTATATAAAATAAATATAAATTAA +AAATAATAACCAAATAATTAATATAATAAATGATAAACAAGAAGATATCCGGGTCCCAAT +AATAATTATTATTGAAAATAATAATTGGGACCCCCACAATAGAATAAAAAATAAAAAGAA +TTAATAATATATAAATAATATAAAATATATTATATATATATATAATATATATATATATAT +AATAAAAAAAAATATATAATATAATATATATATATAAAATAATAAATTATATATATATAT +ATAAAATAATAAAAAATAATAATCATATGAATTTTATAAATATAATTATTATTAATAATA +ATAATAATAATAATAAAGTCCGGTCCGCCCCGCGGAGGGGGCGGACCCCCGAAGGAGTGC +GGGACCCCGTGGGAACCGCATCCCTTTTTATTCTTAATTAAGAAGGAGATAATAATTTAT +AAAAATTAATATTTATTTTATGTAATATTAATATTAATATTAATATAATATAATATAATA +TAATACGGATTAAATATTACCAGTTGTTCACAGGTAATATAAAATCCTATTGTTTCACCT +ATTATTAATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAATAA +GTATATATAATAAAATTTAATAAAAAAAAATAATTATATAATAAATATATATATTATAAT +ATTATATAAATATAAAATATAATTGATATTAACATTATATAATTAATAATATAATCAAAT +AATATAAATATAATATAAAAAGTTTTAATTATTAAAATTATATAAATATTATTTAATAAA +AATAAAAATAATAATAATAATAATAATAATAAAGTCCGGTCCGCCCCCTCCGCGGAGGGG +GCGGACCCCGAAAGAGTGAGGGACCCCCCCGTATACTTACGGGGGGAGAACCGAACCCCT +TTTTTTATTTAAAGAAGGAGATAAATATTTATATCTTTATTTATAATTATATATAAATAA +AAGTTTATTAAAATTTATAATAATAATATAAAAAAGTATATAATAAATTTATTATAAATA +AATAAATATTTAGTAATAATATTTAATAAAATTATAAATATTATAAATAAAATATTAATA +ATAAATAATAAATATATAATATAATATAATATAATAAATTAATAACAATAAGATATCCGG +GTCCCCTAAATAATTATTATATAAAATAATAATTGGGACCCATACATATAAATATAAAAT +ATTTTAATATTTATATATAAATAATAATAATATATATTTATATTATATTATAATATAACC +CTTTCCAATTAATATTAATATTAATATTAATTACTTCCTTAAAAAAATAATAATTAATTA +ATTGATTTTTATATTAATATAAAAAAGTTAATATATATATTTATATATAAATAATATAAA +TTAATATAAAGATAATAAGTCCCCGCTTTCAGCGCAGTGAGGGACCCCCTCCCGTAAATA +TACGGGAGGGGAGACCGAACCCCAAAGGAATAATAAATAATAGTATGTATTTAAATAAAT +ATTTAATATACTATTTTTTTTTATTATTTTTATAATATATTTATAATAATATATTTAATT +ATAATTTATAAAAAAGAGATATAATATTTTATTATATATAATATTAATATAATACAAATT +AACATTATTTAATTATTATTAATAATATTTAACTTTATTATTATCTTCTACGGTTGGACT +CCTTCTTAAAAAGGGGTTCGGTCCCCCTCCCATTAGGGAGGGGTCCCTCACTCCTTCGGG +GTCCGCGCCCCCCGCGGGGGGGGGCGGACCGGACTATTATTACTATTTATTTATTAATAA 
+TAAATAATAAATTATAAAGTCACTGAAAGAGTGAGGAATTTTCCTTTTCCCAAGGGAAAA +CCCCAAAGGATAATATAAATATTATAAAATTTTTATTAAATAATATAAAATTCAATAAAA +TAATTTTAATTAATTAATTAATTAATTAATATAAAAATAAATATTTTTAATTAATATTAA +TATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGAAATATAAATA +TAATAGTATAGTATATAGGAAGTTAATAATAATATAAATATTATATAATATATATATGTA +TATATATTATATTATATAATTAATTTTCTCCTTTTGTATTTACATCTTAATAAAATATAA +AATATAAAATGTTATTAACAATAAAAATTATTAATCTTTATAATATTAATAATAGTAAAT +TTATTTATATATCTCCTTTAGGATGGACTCCTTCGGCCGGACTCCTTCGGGGTCCGCCCC +GCGGGGGCGGGCCGGACTATTTTTATTTTTTTTTTAAAAAATATTAAATATTATAAATAT +ATTATAAATATATTATAAATATGTTATAAATATATTATAAATAGAATATAATATAATATT +ATATATTATAATGATAAAGATTATATATATTTTCTTTTTTTTTTTATTTATTATTTTTAA +TAAGTAAAAATTATATTATATATATATATATATTAGATTTTATAAGTAATATAATATAAG +TATTAATATATAAATGCAATATGATGTAATTGGTTAACATTTTAGGGTCATGACCTAATT +ATATACGTTCAAATCGTATTATTGCTAATAAATTAATATATAATATTTATAAAAAAGTAT +AATAAAATATATTATAAGAAGAATATATTATATAATAATTATATTAATAATATTAATAAA +TAATATATAAATAATTATAAAAAAGTATATAATATTAATCAATTAATTAATTAATAAATA +TAAATAATATATTAATTTTTAATTAATTTGAATAAGATATTTATATTATTAATAGGAAAG +TCATAAATATATAAATTATATTATATAATTAATATAATAATAAAATAAATTATATATTTT +ATTTATAATATTATTTCTTTATAAGATAAAATATTATCTGATGAATAATTAGATTGAATA +ATATTTATAAAGAAATATATATAAAAAGTCATTATATAAATTTAATTATAATTTAAATAA +ATTTTATATAAATTAATATAATATTAATAAAGTAATTAGTATAAATAAATAATATGAAAA +TAAAACTTAATAAATATATAAATATAGTCCGGCCCGCCCCCCCGCGGCGGGCGGACCCCG +AAGGAGTGAGGGACCCCTCCCTAATGGGAGGGGGACCGAACCCCTTTTTAAGAAGGAGTC +CATATATATATATTAATAAAAAAAAGTAATATATATATATATATTGGAATAGTTATATTA +TTATACAGAAATATGCTTAATTATAATATAATATCCATA diff --git a/inferelator_prior/tests/artifacts/sc64_mito.gtf b/inferelator_prior/tests/artifacts/sc64_mito.gtf new file mode 100644 index 0000000..98d6b11 --- /dev/null +++ b/inferelator_prior/tests/artifacts/sc64_mito.gtf @@ -0,0 +1,317 @@ +#!genome-build R64-1-1 +#!genome-version R64-1-1 +#!genome-date 2011-09 +#!genome-build-accession GCA_000146045.2 +#!genebuild-last-updated 2018-10 +Mito sgd gene 731 802 . + . 
gene_id "tP(UGG)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 731 802 . + . gene_id "tP(UGG)Q"; transcript_id "tP(UGG)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 731 802 . + . gene_id "tP(UGG)Q"; transcript_id "tP(UGG)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tP(UGG)Q_tRNA-E1"; +Mito sgd gene 3952 4338 . + . gene_id "Q0010"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 3952 4338 . + . gene_id "Q0010"; transcript_id "Q0010_mRNA"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 3952 4338 . + . gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0010_mRNA-E1"; +Mito sgd CDS 3952 4335 . + 0 gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0010"; +Mito sgd start_codon 3952 3954 . + 0 gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 4336 4338 . + 0 gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 4254 4415 . + . gene_id "Q0017"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 4254 4415 . + . 
gene_id "Q0017"; transcript_id "Q0017_mRNA"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 4254 4415 . + . gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0017_mRNA-E1"; +Mito sgd CDS 4254 4412 . + 0 gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0017"; +Mito sgd start_codon 4254 4256 . + 0 gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 4413 4415 . + 0 gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 6546 8194 . + . gene_id "Q0020"; gene_name "15S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; +Mito sgd transcript 6546 8194 . + . gene_id "Q0020"; transcript_id "Q0020_rRNA"; gene_name "15S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; +Mito sgd exon 6546 8194 . + . gene_id "Q0020"; transcript_id "Q0020_rRNA"; exon_number "1"; gene_name "15S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; exon_id "Q0020_rRNA-E1"; +Mito sgd gene 9374 9447 . + . gene_id "tW(UCA)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 9374 9447 . + . gene_id "tW(UCA)Q"; transcript_id "tW(UCA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 9374 9447 . + . 
gene_id "tW(UCA)Q"; transcript_id "tW(UCA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tW(UCA)Q_tRNA-E1"; +Mito sgd gene 11667 11957 . + . gene_id "Q0032"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 11667 11957 . + . gene_id "Q0032"; transcript_id "Q0032_mRNA"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 11667 11957 . + . gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0032_mRNA-E1"; +Mito sgd CDS 11667 11954 . + 0 gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0032"; +Mito sgd start_codon 11667 11669 . + 0 gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 11955 11957 . + 0 gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 13818 26701 . + . gene_id "Q0045"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 13818 26701 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 13818 13986 . + . 
gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "1"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E1"; +Mito sgd CDS 13818 13986 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "1"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "1"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 16435 16470 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "2"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E2"; +Mito sgd CDS 16435 16470 . + 2 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "2"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd exon 18954 18991 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "3"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E3"; +Mito sgd CDS 18954 18991 . + 2 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "3"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd exon 20508 20984 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "4"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E4"; +Mito sgd CDS 20508 20984 . 
+ 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "4"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd exon 21995 22246 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "5"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E5"; +Mito sgd CDS 21995 22246 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "5"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd exon 23612 23746 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "6"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E6"; +Mito sgd CDS 23612 23746 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "6"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd exon 25318 25342 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "7"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E7"; +Mito sgd CDS 25318 25342 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "7"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd exon 26229 26701 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "8"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E8"; +Mito sgd CDS 26229 26698 . 
+ 2 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "8"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; +Mito sgd stop_codon 26699 26701 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "8"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 13818 19996 . + . gene_id "Q0060"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 13818 19996 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 13818 13986 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "1"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0060_mRNA-E1"; +Mito sgd CDS 13818 13986 . + 0 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "1"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0060"; +Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "1"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 16435 16470 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "2"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0060_mRNA-E2"; +Mito sgd CDS 16435 16470 . 
+ 2 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "2"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0060"; +Mito sgd exon 18954 19996 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "3"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0060_mRNA-E3"; +Mito sgd CDS 18954 19993 . + 2 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "3"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0060"; +Mito sgd stop_codon 19994 19996 . + 0 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "3"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 13818 16322 . + . gene_id "Q0050"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 13818 16322 . + . gene_id "Q0050"; transcript_id "Q0050_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 13818 16322 . + . gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0050_mRNA-E1"; +Mito sgd CDS 13818 16319 . + 0 gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0050"; +Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 16320 16322 . 
+ 0 gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 13818 18830 . + . gene_id "Q0055"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 13818 18830 . + . gene_id "Q0055"; transcript_id "Q0055_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 13818 13986 . + . gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0055_mRNA-E1"; +Mito sgd CDS 13818 13986 . + 0 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0055"; +Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 16435 18830 . + . gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0055_mRNA-E2"; +Mito sgd CDS 16435 18827 . + 2 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0055"; +Mito sgd stop_codon 18828 18830 . + 0 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 13818 23167 . + . gene_id "Q0070"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 13818 23167 . + . 
gene_id "Q0070"; transcript_id "Q0070_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 13818 13986 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E1"; +Mito sgd CDS 13818 13986 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; +Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 16435 16470 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E2"; +Mito sgd CDS 16435 16470 . + 2 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; +Mito sgd exon 18954 18991 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E3"; +Mito sgd CDS 18954 18991 . + 2 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; +Mito sgd exon 20508 20984 . + . 
gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E4"; +Mito sgd CDS 20508 20984 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; +Mito sgd exon 21995 23167 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "5"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E5"; +Mito sgd CDS 21995 23164 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "5"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; +Mito sgd stop_codon 23165 23167 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "5"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 13818 21935 . + . gene_id "Q0065"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 13818 21935 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 13818 13986 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "1"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E1"; +Mito sgd CDS 13818 13986 . + 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "1"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; +Mito sgd start_codon 13818 13820 . 
+ 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "1"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 16435 16470 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "2"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E2"; +Mito sgd CDS 16435 16470 . + 2 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "2"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; +Mito sgd exon 18954 18991 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "3"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E3"; +Mito sgd CDS 18954 18991 . + 2 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "3"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; +Mito sgd exon 20508 21935 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "4"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E4"; +Mito sgd CDS 20508 21932 . + 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "4"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; +Mito sgd stop_codon 21933 21935 . + 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "4"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 24156 25255 . + . 
gene_id "Q0075"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 24156 25255 . + . gene_id "Q0075"; transcript_id "Q0075_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 24156 24870 . + . gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0075_mRNA-E1"; +Mito sgd CDS 24156 24870 . + 0 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0075"; +Mito sgd start_codon 24156 24158 . + 0 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 24906 25255 . + . gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0075_mRNA-E2"; +Mito sgd CDS 24906 25252 . + 2 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0075"; +Mito sgd stop_codon 25253 25255 . + 0 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 27666 27812 . + . gene_id "Q0080"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 27666 27812 . + . gene_id "Q0080"; transcript_id "Q0080_mRNA"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 27666 27812 . + . 
gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0080_mRNA-E1"; +Mito sgd CDS 27666 27809 . + 0 gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0080"; +Mito sgd start_codon 27666 27668 . + 0 gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 27810 27812 . + 0 gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 28487 29266 . + . gene_id "Q0085"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 28487 29266 . + . gene_id "Q0085"; transcript_id "Q0085_mRNA"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 28487 29266 . + . gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0085_mRNA-E1"; +Mito sgd CDS 28487 29263 . + 0 gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0085"; +Mito sgd start_codon 28487 28489 . 
+ 0 gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 29264 29266 . + 0 gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 30874 31014 . + . gene_id "Q0092"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 30874 31014 . + . gene_id "Q0092"; transcript_id "Q0092_mRNA"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 30874 31014 . + . gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0092_mRNA-E1"; +Mito sgd CDS 30874 31011 . + 0 gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0092"; +Mito sgd start_codon 30874 30876 . + 0 gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 31012 31014 . + 0 gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 35373 35444 . + . gene_id "tE(UUC)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 35373 35444 . + . 
gene_id "tE(UUC)Q"; transcript_id "tE(UUC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 35373 35444 . + . gene_id "tE(UUC)Q"; transcript_id "tE(UUC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tE(UUC)Q_tRNA-E1"; +Mito sgd gene 36540 42251 . + . gene_id "Q0120"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 36540 42251 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 36540 36954 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E1"; +Mito sgd CDS 36540 36954 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; +Mito sgd start_codon 36540 36542 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 37723 37736 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E2"; +Mito sgd CDS 37723 37736 . + 2 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; +Mito sgd exon 39141 39217 . + . 
gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E3"; +Mito sgd CDS 39141 39217 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; +Mito sgd exon 40841 42251 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E4"; +Mito sgd CDS 40841 42248 . + 1 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; +Mito sgd stop_codon 42249 42251 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 36540 38579 . + . gene_id "Q0110"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 36540 38579 . + . gene_id "Q0110"; transcript_id "Q0110_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 36540 36954 . + . gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0110_mRNA-E1"; +Mito sgd CDS 36540 36954 . + 0 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0110"; +Mito sgd start_codon 36540 36542 . 
+ 0 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 37723 38579 . + . gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0110_mRNA-E2"; +Mito sgd CDS 37723 38576 . + 2 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0110"; +Mito sgd stop_codon 38577 38579 . + 0 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 36540 40265 . + . gene_id "Q0115"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 36540 40265 . + . gene_id "Q0115"; transcript_id "Q0115_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 36540 36954 . + . gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0115_mRNA-E1"; +Mito sgd CDS 36540 36954 . + 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0115"; +Mito sgd start_codon 36540 36542 . + 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 37723 37736 . + . 
gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0115_mRNA-E2"; +Mito sgd CDS 37723 37736 . + 2 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0115"; +Mito sgd exon 39141 40265 . + . gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0115_mRNA-E3"; +Mito sgd CDS 39141 40262 . + 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0115"; +Mito sgd stop_codon 40263 40265 . + 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 36540 43647 . + . gene_id "Q0105"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 36540 43647 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 36540 36954 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "1"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E1"; +Mito sgd CDS 36540 36954 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "1"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; +Mito sgd start_codon 36540 36542 . 
+ 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "1"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 37723 37736 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "2"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E2"; +Mito sgd CDS 37723 37736 . + 2 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "2"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; +Mito sgd exon 39141 39217 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "3"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E3"; +Mito sgd CDS 39141 39217 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "3"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; +Mito sgd exon 40841 41090 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "4"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E4"; +Mito sgd CDS 40841 41090 . + 1 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "4"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; +Mito sgd exon 42508 42558 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "5"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E5"; +Mito sgd CDS 42508 42558 . 
+ 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "5"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; +Mito sgd exon 43297 43647 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "6"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E6"; +Mito sgd CDS 43297 43644 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "6"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; +Mito sgd stop_codon 43645 43647 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "6"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 46723 46953 . + . gene_id "Q0130"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 46723 46953 . + . gene_id "Q0130"; transcript_id "Q0130_mRNA"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 46723 46953 . + . gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0130_mRNA-E1"; +Mito sgd CDS 46723 46950 . + 0 gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0130"; +Mito sgd start_codon 46723 46725 . 
+ 0 gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 46951 46953 . + 0 gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 48201 48290 . + . gene_id "tS(UGA)Q2"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 48201 48290 . + . gene_id "tS(UGA)Q2"; transcript_id "tS(UGA)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 48201 48290 . + . gene_id "tS(UGA)Q2"; transcript_id "tS(UGA)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tS(UGA)Q2_tRNA-E1"; +Mito sgd gene 48901 50097 . + . gene_id "Q0140"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 48901 50097 . + . gene_id "Q0140"; transcript_id "Q0140_mRNA"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 48901 50097 . + . gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0140_mRNA-E1"; +Mito sgd CDS 48901 50094 . + 0 gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0140"; +Mito sgd start_codon 48901 48903 . 
+ 0 gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 50095 50097 . + 0 gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 51052 51228 . + . gene_id "Q0142"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 51052 51228 . + . gene_id "Q0142"; transcript_id "Q0142_mRNA"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 51052 51228 . + . gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0142_mRNA-E1"; +Mito sgd CDS 51052 51225 . + 0 gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0142"; +Mito sgd start_codon 51052 51054 . + 0 gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 51226 51228 . + 0 gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 51277 51429 . + . gene_id "Q0143"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 51277 51429 . + . 
gene_id "Q0143"; transcript_id "Q0143_mRNA"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 51277 51429 . + . gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0143_mRNA-E1"; +Mito sgd CDS 51277 51426 . + 0 gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0143"; +Mito sgd start_codon 51277 51279 . + 0 gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 51427 51429 . + 0 gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 54109 54438 . + . gene_id "Q0144"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 54109 54438 . + . gene_id "Q0144"; transcript_id "Q0144_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 54109 54438 . + . gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0144_mRNA-E1"; +Mito sgd CDS 54109 54435 . + 0 gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0144"; +Mito sgd start_codon 54109 54111 . 
+ 0 gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 54436 54438 . + 0 gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 58009 62447 . + . gene_id "Q0158"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; +Mito sgd transcript 58009 62447 . + . gene_id "Q0158"; transcript_id "Q0158_rRNA"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; +Mito sgd exon 58009 60724 . + . gene_id "Q0158"; transcript_id "Q0158_rRNA"; exon_number "1"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; exon_id "Q0158_rRNA-E1"; +Mito sgd exon 61868 62447 . + . gene_id "Q0158"; transcript_id "Q0158_rRNA"; exon_number "2"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; exon_id "Q0158_rRNA-E2"; +Mito sgd gene 61022 61729 . + . gene_id "Q0160"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 61022 61729 . + . gene_id "Q0160"; transcript_id "Q0160_mRNA"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 61022 61729 . + . gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0160_mRNA-E1"; +Mito sgd CDS 61022 61726 . 
+ 0 gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0160"; +Mito sgd start_codon 61022 61024 . + 0 gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 61727 61729 . + 0 gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 63862 63937 . + . gene_id "tT(UGU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 63862 63937 . + . gene_id "tT(UGU)Q1"; transcript_id "tT(UGU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 63862 63937 . + . gene_id "tT(UGU)Q1"; transcript_id "tT(UGU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tT(UGU)Q1_tRNA-E1"; +Mito sgd gene 64415 64490 . + . gene_id "tC(GCA)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 64415 64490 . + . gene_id "tC(GCA)Q"; transcript_id "tC(GCA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 64415 64490 . + . gene_id "tC(GCA)Q"; transcript_id "tC(GCA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tC(GCA)Q_tRNA-E1"; +Mito sgd gene 64596 64670 . + . gene_id "tH(GUG)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 64596 64670 . + . gene_id "tH(GUG)Q"; transcript_id "tH(GUG)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 64596 64670 . + . 
gene_id "tH(GUG)Q"; transcript_id "tH(GUG)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tH(GUG)Q_tRNA-E1"; +Mito sgd gene 65770 66174 . + . gene_id "Q0182"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 65770 66174 . + . gene_id "Q0182"; transcript_id "Q0182_mRNA"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 65770 66174 . + . gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0182_mRNA-E1"; +Mito sgd CDS 65770 66171 . + 0 gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0182"; +Mito sgd start_codon 65770 65772 . + 0 gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 66172 66174 . + 0 gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 66095 66179 . + . gene_id "tL(UAA)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 66095 66179 . + . gene_id "tL(UAA)Q"; transcript_id "tL(UAA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 66095 66179 . + . 
gene_id "tL(UAA)Q"; transcript_id "tL(UAA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tL(UAA)Q_tRNA-E1"; +Mito sgd gene 66210 66285 . + . gene_id "tQ(UUG)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 66210 66285 . + . gene_id "tQ(UUG)Q"; transcript_id "tQ(UUG)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 66210 66285 . + . gene_id "tQ(UUG)Q"; transcript_id "tQ(UUG)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tQ(UUG)Q_tRNA-E1"; +Mito sgd gene 67061 67134 . + . gene_id "tK(UUU)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 67061 67134 . + . gene_id "tK(UUU)Q"; transcript_id "tK(UUU)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 67061 67134 . + . gene_id "tK(UUU)Q"; transcript_id "tK(UUU)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tK(UUU)Q_tRNA-E1"; +Mito sgd gene 67309 67381 . + . gene_id "tR(UCU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 67309 67381 . + . gene_id "tR(UCU)Q1"; transcript_id "tR(UCU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 67309 67381 . + . gene_id "tR(UCU)Q1"; transcript_id "tR(UCU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tR(UCU)Q1_tRNA-E1"; +Mito sgd gene 67468 67542 . + . gene_id "tG(UCC)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 67468 67542 . + . gene_id "tG(UCC)Q"; transcript_id "tG(UCC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 67468 67542 . + . 
gene_id "tG(UCC)Q"; transcript_id "tG(UCC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tG(UCC)Q_tRNA-E1"; +Mito sgd gene 68322 68396 . + . gene_id "tD(GUC)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 68322 68396 . + . gene_id "tD(GUC)Q"; transcript_id "tD(GUC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 68322 68396 . + . gene_id "tD(GUC)Q"; transcript_id "tD(GUC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tD(GUC)Q_tRNA-E1"; +Mito sgd gene 69203 69288 . + . gene_id "tS(GCU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 69203 69288 . + . gene_id "tS(GCU)Q1"; transcript_id "tS(GCU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 69203 69288 . + . gene_id "tS(GCU)Q1"; transcript_id "tS(GCU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tS(GCU)Q1_tRNA-E1"; +Mito sgd gene 69289 69362 . + . gene_id "tR(ACG)Q2"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 69289 69362 . + . gene_id "tR(ACG)Q2"; transcript_id "tR(ACG)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 69289 69362 . + . gene_id "tR(ACG)Q2"; transcript_id "tR(ACG)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tR(ACG)Q2_tRNA-E1"; +Mito sgd gene 69846 69921 . + . gene_id "tA(UGC)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 69846 69921 . + . gene_id "tA(UGC)Q"; transcript_id "tA(UGC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 69846 69921 . + . 
gene_id "tA(UGC)Q"; transcript_id "tA(UGC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tA(UGC)Q_tRNA-E1"; +Mito sgd gene 70162 70237 . + . gene_id "tI(GAU)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 70162 70237 . + . gene_id "tI(GAU)Q"; transcript_id "tI(GAU)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 70162 70237 . + . gene_id "tI(GAU)Q"; transcript_id "tI(GAU)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tI(GAU)Q_tRNA-E1"; +Mito sgd gene 70824 70907 . + . gene_id "tY(GUA)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 70824 70907 . + . gene_id "tY(GUA)Q"; transcript_id "tY(GUA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 70824 70907 . + . gene_id "tY(GUA)Q"; transcript_id "tY(GUA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tY(GUA)Q_tRNA-E1"; +Mito sgd gene 71433 71503 . + . gene_id "tN(GUU)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 71433 71503 . + . gene_id "tN(GUU)Q"; transcript_id "tN(GUU)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 71433 71503 . + . gene_id "tN(GUU)Q"; transcript_id "tN(GUU)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tN(GUU)Q_tRNA-E1"; +Mito sgd gene 72630 72705 . + . gene_id "tM(CAU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 72630 72705 . + . gene_id "tM(CAU)Q1"; transcript_id "tM(CAU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 72630 72705 . + . 
gene_id "tM(CAU)Q1"; transcript_id "tM(CAU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tM(CAU)Q1_tRNA-E1"; +Mito sgd gene 73758 74513 . + . gene_id "Q0250"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 73758 74513 . + . gene_id "Q0250"; transcript_id "Q0250_mRNA"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 73758 74513 . + . gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0250_mRNA-E1"; +Mito sgd CDS 73758 74510 . + 0 gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0250"; +Mito sgd start_codon 73758 73760 . + 0 gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 74511 74513 . + 0 gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 74495 75984 . + . gene_id "Q0255"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 74495 75984 . + . gene_id "Q0255"; transcript_id "Q0255_mRNA"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 74495 75622 . + . 
gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "1"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0255_mRNA-E1"; +Mito sgd CDS 74495 75622 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "1"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0255"; +Mito sgd start_codon 74495 74497 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "1"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 75663 75872 . + . gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "2"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0255_mRNA-E2"; +Mito sgd CDS 75663 75872 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "2"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0255"; +Mito sgd exon 75904 75984 . + . gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "3"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0255_mRNA-E3"; +Mito sgd CDS 75904 75981 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "3"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0255"; +Mito sgd stop_codon 75982 75984 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "3"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 77431 77505 . + . 
gene_id "tF(GAA)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 77431 77505 . + . gene_id "tF(GAA)Q"; transcript_id "tF(GAA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 77431 77505 . + . gene_id "tF(GAA)Q"; transcript_id "tF(GAA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tF(GAA)Q_tRNA-E1"; +Mito sgd gene 78089 78162 . - . gene_id "tT(UAG)Q2"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 78089 78162 . - . gene_id "tT(UAG)Q2"; transcript_id "tT(UAG)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 78089 78162 . - . gene_id "tT(UAG)Q2"; transcript_id "tT(UAG)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tT(UAG)Q2_tRNA-E1"; +Mito sgd gene 78533 78608 . + . gene_id "tV(UAC)Q"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 78533 78608 . + . gene_id "tV(UAC)Q"; transcript_id "tV(UAC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 78533 78608 . + . gene_id "tV(UAC)Q"; transcript_id "tV(UAC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tV(UAC)Q_tRNA-E1"; +Mito sgd gene 79213 80022 . + . gene_id "Q0275"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 79213 80022 . + . gene_id "Q0275"; transcript_id "Q0275_mRNA"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 79213 80022 . + . 
gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0275_mRNA-E1"; +Mito sgd CDS 79213 80019 . + 0 gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0275"; +Mito sgd start_codon 79213 79215 . + 0 gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 80020 80022 . + 0 gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd gene 85035 85112 . + . gene_id "tM(CAU)Q2"; gene_source "sgd"; gene_biotype "tRNA"; +Mito sgd transcript 85035 85112 . + . gene_id "tM(CAU)Q2"; transcript_id "tM(CAU)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; +Mito sgd exon 85035 85112 . + . gene_id "tM(CAU)Q2"; transcript_id "tM(CAU)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tM(CAU)Q2_tRNA-E1"; +Mito sgd gene 85295 85777 . + . gene_id "Q0285"; gene_source "sgd"; gene_biotype "ncRNA"; +Mito sgd transcript 85295 85777 . + . gene_id "Q0285"; transcript_id "Q0285_ncRNA"; gene_source "sgd"; gene_biotype "ncRNA"; transcript_source "sgd"; transcript_biotype "ncRNA"; +Mito sgd exon 85295 85777 . + . gene_id "Q0285"; transcript_id "Q0285_ncRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "ncRNA"; transcript_source "sgd"; transcript_biotype "ncRNA"; exon_id "Q0285_ncRNA-E1"; +Mito sgd gene 85554 85709 . + . 
gene_id "Q0297"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; +Mito sgd transcript 85554 85709 . + . gene_id "Q0297"; transcript_id "Q0297_mRNA"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd exon 85554 85709 . + . gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0297_mRNA-E1"; +Mito sgd CDS 85554 85706 . + 0 gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0297"; +Mito sgd start_codon 85554 85556 . + 0 gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +Mito sgd stop_codon 85707 85709 . 
+ 0 gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; diff --git a/inferelator_prior/tests/artifacts/test.meme b/inferelator_prior/tests/artifacts/test.meme new file mode 100644 index 0000000..7054798 --- /dev/null +++ b/inferelator_prior/tests/artifacts/test.meme @@ -0,0 +1,23 @@ +MEME version 4 + +ALPHABET= ACGT + +strands: - + + +Background letter frequencies: +A 0.25000 C 0.25000 G 0.25000 T 0.25000 + +MOTIF M00799_2.00 Gata4 + +letter-probability matrix: alength= 4 w= 9 + 0.248650 0.261399 0.241301 0.248650 + 0.392226 0.219607 0.070234 0.317932 + 0.025762 0.011346 0.942829 0.020063 + 0.931184 0.018110 0.009175 0.041532 + 0.020910 0.020496 0.005004 0.953590 + 0.652610 0.066752 0.028247 0.252391 + 0.541666 0.110460 0.158325 0.189549 + 0.196060 0.286696 0.350172 0.167072 + 0.286130 0.225078 0.309961 0.178832 + +URL diff --git a/inferelator_prior/tests/artifacts/test.motif b/inferelator_prior/tests/artifacts/test.motif new file mode 100644 index 0000000..b25c648 --- /dev/null +++ b/inferelator_prior/tests/artifacts/test.motif @@ -0,0 +1,10 @@ +>CAGATAAGG M00799_2.00 5.408651 + 0.2487 0.2614 0.2413 0.2487 + 0.3922 0.2196 0.0702 0.3179 + 0.0258 0.0113 0.9428 0.0201 + 0.9312 0.0181 0.0092 0.0415 + 0.0209 0.0205 0.0050 0.9536 + 0.6526 0.0668 0.0282 0.2524 + 0.5417 0.1105 0.1583 0.1895 + 0.1961 0.2867 0.3502 0.1671 + 0.2861 0.2251 0.3100 0.1788 diff --git a/inferelator_prior/tests/artifacts/test_alignment.fastq.gz b/inferelator_prior/tests/artifacts/test_alignment.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..fd8687da23c17247a6bb8208ec669bf8a7498472 GIT binary patch literal 9201 zcmVZ02o73;%#&F9x%k|Hp`oAeE`^K6#$~p6m4Ot}dw>R}_;B27_7u zbnO58`0vZ>AO7;sugCI8+n=BJpa1iJ9ozEH^=RfCpew^RaQT?HV&9MRp)Z}kp2v!uXViLrucGIjXWPl=i|3Pa zzE;Zl^T26`zU6$Q9{%IM{qc`-KK6b8$9?dyv-osyYW5TG^N)WrE3BV^6<%vHBfJ_T 
z=;a*tRq;cw;n1g|S7aA+{?~p9-k?{iC(BXoG&PyM`~0$Z9(F{^X#^2k;dZrnY0_`$ zc8ybSg#5iB^e^aN7NHS7BO}bOw7yR(Y+GXmy|a2<=XbrI=N&&io2!RO26 zJbajS>=m_~+CZ&wz(v$b=X>ETcJ%z`Rv6ygA1?IAhxoDc4^Onhylx{iydTf?G{gJT zm_h9S6!%x`UYj@L_KJi9@Q(9l!&RzUtCZp=AM`785`U@cijYb4=1%9gJlT))_D#D? ze&_X9JxAK;$eeL#&|FTcnj`PIR95`xr zi1(Y#+P|3G44;u1-tTle!}+YSf_d;6*s4ar^QTx0M3k^)!GH7?UKwwEa{~NGrdEvoznb{cT!dd5p6}(x;E5i;or)bZOi2*VTXiUd{V05txUliKJRg zIFJj>-F0r+mD@xWy-o-#8yFj!Rj0KQ=h2Q zoDOmRIRA4iyhpyUJRa*ypZAf$uF}>%vZP+3x_C7kSfs3WsNjTh#FDX-u?p<&>)$!_ zj0&HCOPCq-s&BDwy5H#DRqjWwcSls1t`~htET4IlEI;q%e7ZbZX9ySffXJYz<|+1S z)pU6PP#_pqkEp%~Y&n@LSiZp@tz1WlkLP1bmJz4h^&uRbU&(T)Sfb2=hyX7*U*=Qw zfzwq^XTk5;0)*sDD?HOOc82AQkMpy|3>wKa@aY}ZaChK;UxT7Kyj`?8yShiIxaS5I zDlWS?&?-Py_BU)**lM%*XwD$rr8mzA4-(czM3Ju;3hA-^JJ*pg!zf$69_g)CNWB$o zr2`KXgFO0NV}xMg!%=Xrb;aA|ik$%nSrJ{u4wS=Th?(Jh(XxbKRp3rf`AnI*%q`rX zF#t(0WxL~3PQ1}bCz=|wW(#!!z4w+QG(p-QlHFc(D}2(x3eS%Q$$4G+ED;aFfeP@$ z*Gj#96F&2mBlUU`?=UYo9fj`zeoS3r`+GbbL@(o9`PN*yc8I~!Bk&OLV~JRSK#6aF z5^`YGcnCG%Xy5@@$`0zvI?O;wCKebF_K63?ZgK`#5abG7`;HWL9LeVyX|j91TPko< zrU`Gc=c1>CFRa5_V&0ZTpTD}mfLLc<&MdD0*o9_$wd`Q3gqiMExGrBb=GzLl-!(SqEl$(kTgH&3hDP;+^NKab zXtvk^D-3nJ71AhIp4Y#9)C}(*BE#3m@$19Q3E!V-6_&=wr16o}1HFFwAy}xx%a>Q; zp@;G>BEx5tCDx?1#GJOy3iN-w+bEh)A_ELC?e!aeaEu+Y=rR8xx-P$Z6ES7;3EhreE z)HN+Y20}o9;`=bbfTBa3h_R?Bh9C@0S|}|{J6IC7h_5v<1zTh2d5h7nUg{jSSnnGlMW8kQynP=QIc~?=x6fg-MWUWQ;#yElIu#u%jLb zsK(?deqTxjBT%6PQ4t_5hez73>*8#!=Yh?E|ksiZr zIU_WDZg~Q6p;&y^B_N&(SyvktW)7(b$rgZuYHI?RN-DFY1(3to7_%lo18$nTxp+XZ zdquWP+o+m&&uWEbz7-P=oGN$%E2TlK z2uL*4)rVgsfHy;7FF~N0ENZjpaDfg z+8GQ`Uq*BYAUOdT@VI{Ww`TG3BiTvW7zzar$SYxnQMO!+jb3gv1AM%oBqn_(;7~0E z9y9{@l%vTM$P$%rB@J!wHOQnk)1>PKHhn<|8cxK663RlK#{@sdfcgb*O|dxC4bG*I zcT`oh!uEI!^Tg-1XqfCsC$YQ8yG};l2u5&5&OZdHKp0&aDXi``;HY7d@u4{KupIMa zfb<36hvfG9H;;%Y_KR@I$mI<0bsh2yZ}W!Nc0&zDy8$Y*yW%`TbZeYVp|^l&vzn?f zc@l;j&=w~vZ0+34&@Oy>fpnZBZg-nR^fJjb!DTu_S)>Rv3`wVDJyYswhHbuu1~HVt zqOw+mO7A{ZWQu}A1|iL6Y|R`XDJ&pZAt4~84*FF=gakD(Pig;C;`&CWHhl+1SY;XRM8^8G-A;p+YQ_DH>JtsJ5Kv 
zvg82u?4%mupoKc3>4MG`N(wJFkYRzLfEaG7RidhFS{4UxazjG1t_n{2>dkn) z#kOV8YV6QWLS}V$7?rGUIxjn0q2=UrCL_D=4!t#K>Z^%G=J{*%>mI}Q?e?6{U!L>a zw=HqWAlME02@(XVW+XuelynC+v~Yg4_C+xS;qZxth1r)#+KG}(TJc&J8Au6Vnh1EG zJHAf*helO&FBm7ZHxYSae!NBn#r!@YgWBQQQ7$N&NT*j80u~;R0F)-sK(E4(CHgb~ z-Z0HUEKcGsnzv|Aq%?g&PX_(ep)FgWL1P)fnTLQ3j29jqkS2_&mplnyzU(lhA=CWD z48`k4@la$CeCkZrF;erGTl%p;#fpnyfaNV&aus8noqrsA=4hMahJozEcs| z+LW0cOxPti1dWmn_6BJ*4f8yT4)52po@RJHZp|tyn5>?z-jq4obg3or@WvB6YwR z_ChViapgtH!mTaQX+)Ts8+&tvZwWhye4LT)Iyx8oG%v%XnO>`ghm^WfyCou{QWV#) zP@E9571y+coCOQQIZV*F{sVtGmc`@(4uAzC4-6FL?9?K{L0qZfMad0g6@?`Y$-J2& zJUr7dJbY#g51+12L7D(MmIIkU2&j+4^=)!^hhC34EFi;D4R}|)Xa=e!bpm%ICUene*3b?>J+DJVJ%2vW^S5>B?La!Q0z+7B7)~!? z0b3FVx6!q;#K7!nLBO>w1yUDa2`6PIO5k=MGtG=zAN5?4FM=?CNisx5|^wC z(ZIUA2RJFDg@a9S3nK+IOsQdq{NqJWUHu%9aB>2|F!W`Ny#X958w2QS>@e(Ar04s6 zw*#ukLIhn3gP=VprG%!wfSE509xeclHn?{!OF>Zq?ZZK0Ho^n$YrevLx(w2NQ+i2P z#}v(q<}5v6!F4j?YQH*Hb9#+v`gwjo&#bw=J5i)!Xe@dJ19J8t>MJWM@Fi|r5Gj;GQrgISPX4_DP8SyAs4PPgD^xoiWGhf~iWAi&>e;)RSSv)c-f@!F z(Al{8Ns^bJMqa6~VYj};t|>IDxKeSJ))vM~N`oHKSUA3&$l`}^8M)liVRc16 zGKvI%C|)@;NO4V!mucwQ8Nq2aM|C?SxoS-mi2@yV>5*b?Fq@#1EZCNSZD& zdMEJ&J_aZf#xOC1j~!!S+t1{->Big z37^tE&sP~iPV(aD&M=wutBI4_d)Ul;Z_m>V^V+$?-{>K%HCzFMYNHZGP3{gL>Kk%} z6*f6*D894;cSeRCEd>%ew4K-+(Ow(xV zTkU0`!tL-LH}d91DY@6&he=RTg&hme8kI95=tNYn8A{X+0XDDFqs;Xh0dh|vd-ih5X~xN$76uTeD^kQyY3#@^TU(bu3^j3QcsR4> ztK*`C0$?|Sl@8Xd0&qh0O%@IX2(luj^{%64i{c1L1SZh*^>BwKU+mi47RHI30vKd= zs`h=C3q#sV*JKG)$H7fzix6Rkq2yyp+e??lv&)(VnI3iF0YMu&vevZoC}0GG_43fd zJE)Wek%jI**f`qIiqeCe+$03QT3hDnN`gE4lDIAkovxIc5ke={rV!0oSlkUdn05-G zGVJgkF|_MvTh$Edn`}`^6YNX(6!gq%1rIW=~eBJ8J#{DL#$ zs;!9hYt1}qazEja7A$i?$>lK|JlmEfnTWdAiiUWH)N5hLJKRr-bJbRN`k)B`5*j!~ zB0>sIq)tT2w~OLFh6~9{H0aRkhkQNVp=Y9mdJare)guIO zl@h@Mvg&66T!jG6PN{{1Y#d$_xq>9i176(ZjvzrXP3?_uSZPOJ`*cYNt=zUQ&KV-p zbh*QGlrz_5)&i@SzmRkW)qKF;Ia>?T6+_5u{x)QbhZU(I5<7^YO1z*ik$4Ct;P-0y z|I8y_%D81-h9k!<0$SN7$$jG;iH!RCgN&b8&Et!^10K;wt!b3$- zfyNzg{8dmHvb4B~kqjHT!2lzSM{2IP?iQ06utRES$=jn%rh7z(A0N$ew12EC6n1!x 
z>dbXnA8Lj!Mshq#I3{6oCkNQV_8f0GF1u80Xd$uVMWN;t!_Jm_*#NQU%oUTX72@J4 ze^Mw!8?|F6A$gcw3=^*;^A8_5LJ51BVW|9Awsldix~s3OTPKq|%-T?piyyMcyullWaWnRajN?9sLe zI035WW+tzcGLs21Aifkvxza5=-_VdbVH*&EW^9Xl^9k!wLSk>Hd$zKvjg%3qG!b;913O z8KarsKfG6em%)&l>|S%lx%*c3Gn-{Gthcwcm>E9389>lu_y~G1i*(fsP0xlLJFOVX zV7m&_-e`!j&>~gm$kH%W2KUr-V0Ooni4cN=1=_e1o=kE!`F1xAL1TIvabo7$0cu&t z+kwrSZhn-uZYL}w3s6`T07Qs5Kvq_aTKcX&5ZxCjF4OhRc>j*q7i*0g}wq7V$I5!cpY3O}8 zQF4%ohA6a_D@PpSs=y)v9uI0Ii*SpeLJ%bxim1u?12473prJE!<^+=q#jo6%yE-hR z0M9o34gx&FgOaWze|Rb#w{<)J6oTalm26YHgxp&hg=zrz7mTfO@W;o-H8vgUJLlbYdqOPaX*7VS$+qm_E_5Hx}XY!vyX zQZZN{A*e!h6z6AvH*G^TOj=l0q;43&AUCr$NC8$!X2>IBG!ZJ{l(C}$TB17|yc&BG z8b6mijHj@)r+M0;YwZ#fi}xpM_0ckVxj3za3{ibTT|BQyWDg1LF@FIwiA}+P5SncN zJy^*1Co(k%WB>M4{lj60Gz^m`q<+{&A04K)9A6-bgM`8EJmwX-?#VxA*)=FjDu?z)5VfS| zFk8G{W`@4e@AR5bMeS9zDWplTATU^EUeU~{E~LcUV(UDZ(PzI6OU1x$Fsp5Fyx-~Q zL`c*6y`D@+8Wv3OpCOKGd$cqzE!NN5>~Py34NAvrVROOOjFv%!(9^>X+t?28w5TfT z?;?Qt`O$8d*Y>d*XlUDlwK%i$j`oq=6RuZm9R-N`jS22^MQE5@=guE0^`1}*vM(jX z3^nGE$v+E4vFK{Kl8^usyQiS_7izJ{7HJAsn0(BDSUWO-xyF?i_CCWnIg9PU3`*wA zza4gXjdwixyw6&5>vn{Kp09X|2Gq3p)G|~6;ui`Q({~eO?~>rjIU4707$ZXQpujg# zK~H1sJD6r&@)8)ca{`nU)71IO~%^}MWVSHceg@)kK| zZ+92F+E|};C zne(j#<(k~xvA3HM2!1uXLJ%>su73c^NDcc$JvBHNZ26*3jj^zzj5{>PhMJFgNKB?j zQg`TNg0*_PB|u|LCtDxOT>y+WH$bSY%nF8X$W~rsTMOq5LZVPBG);h?7EjY2{Ee zy^3XtflEi#pdgdg$`f53kV1py1X7l@rMSK!a6}_36O5W*cgZXrdU!s@XmZvX>;3Gv z{P@p|?Elv6swYj74tq=|;)YWc7OGh1Rl;2+ZU+`+*7denT)8pBaDRj6@_gLw@Er=a zrUNI{qR)G&r*Q_G-)K?@woD7ZanCSg_)OZZQSmo^gdK)*$>+9hYKE?TbBRVdl96#WN*q%Dj9J5+lFTQ9pmm)9A|Ha8*Rmi zDUBIM$>L=@?)GNjieT#wyuD|~Xy(xi3O1=f!Z!}@?W5s`(RMQS&e(oYONx8)Z#}x^ zc7H^=v)l~xW2gk$wse}I-+1_N;3q(XsFa}kI=E{;MhXhlaKbGp2`^A&u&*x&Acb`s z@(Z8?_%~Amb4_Ewh?^<<$f8x+w#||Ex3Nc>B~QWLN^}g6Ti`LTy}0Lbb9Y_enlsni z1QJ54h{pVz8jkOOa_%99MWW{^=gyl>h%F0*LZ+Gre-wz%frQz(!8>J z+bbk<{A+H9Wwy^l#?kQY$; zp#Rq+LINmhG&O5=v7k4m*g`)2h!!0?16!Lcj~BqcW}a_4{=D3^?40;aJ3 zr(9+j3d7EYbJka%N1rPM5b^LQ)$i^RA|G#FkXb>HqzE4yxZCMiU=YfE(^L?rwcOdAgxG3R9jYA*t 
z?WaerA*7A>Aoa9wiyl_1BbI(?To3Ld_0xvkKR(gZL*Kh2vIqNvuXT9yO zNmpRsBF2@2nDZ#e0^$e;8k^2l-=J79mlf?E;r8Xei5}Y$J314|^$TdfI01dtOjO2t zFPBTmXR{jd1d?xjj>nj+e?4@={!i+oq|`5y>j-cpBRHe%D9OUb&y(v`Rk+^VJSNKR z<|a=up3kNX?|HYWWaxZwX1bu3yAt>1RPrXmLwXK}`e#iRpKhN~VaI{gG-9Po?I~da zA%P|*wv;^rB~(sbpzv_H1KNT#gsX^wjsbu&qoMD!el@4~OUMKrY#LZV<7%>vn|Fzd zP2YHZtjl;=ygn)(eLwEywyzBHfA8q?hh@p)t^R^y9J(hiq2|r)!wb}->GV$x82GEg zAap;NS{9hZM;Y-~!w%bUXm@$8Z)}KjUltYqPcf*?9CP2Ra`2k@sRTrg+}Fa}S8Rj( z^;d2^iAoIhHa5Jo>8c4rs2DoLMd9P&uC5;PDaRCJT9vDXe~Zh*`dMLz;isA`ujh## z^b@f9tXaLLD!;oYvHVqg5`Q;|H7vLTFXt;xG{@Q+4L`#UV|D$i@!@6%bSSC);ZCD2 z^+5);s}as=aV&>q9a4(VwHMdOIhZ7jXt$o-uZ=8lzbOm#OS{|W#yH5ACw@|-&26Zc zTpl{beJ0PFjQDRYBQJ&Z2HHKht+I??Z-x*}Oly}WU(Dczl(>i%!69FgkA25S!i2p3 z`5v;f^XomoPdju(lxM^8_v_;PuuHkLcN`I-xpvdN!B|ss_7330eCfNw?9zMy);6v+ zw^y>P(RXhe^lSx^>qaqbiBJ6@sB{8fNKir(<&^C8&9x-rI||x^@WD!!q z^YFrCZ%TJdxtW`zIE-2!^N_87KHnNAw{GmDu;T*N1GvV6QPD>U5CFv|QL3|#kwXX| zYE-5CGzS+FH~#Q_PmfOU)4?xcqNN?oLoPcEtBYlRuIdk4-_84bb}WQS!8E}VA{YCV z_lup<=iE6-GpwL6XI*;HmT&~?@nfe!)T)Ln!VW{>==^-Y7mb!T4U8eagBh4I!o!&kN3)u*<)dckM^+CnB7eF4**8Tb zx)qIz9PJ;5RIoEgFZ~GJqnA`Jk-`E%GHlOioj-%fwk%183Ui$8Mw{z%xF2zPZBP5u zoBIT_t>)sY1}ehDcM$a`2&l;!XDG}MYDSqXA&TLkGgnH0s3@2{SsG)VWM>CLt__J@ zJLiUDLmUMP-w6jYj@I<*VTR#{h<-i~T@?N92K=PY;-jkA2FVAzQD{I87!#a`Asnr- zH3OY}C4-G!k_!7h(ohfAU>XN0u5H9{x$!I6$=Fp2Bvj01dDGi*Xw+EicqA=7=ZEnR z_buE1CVNmHEJO@ch)Y0jP*M=n62l%?sLW8{pRBz5seq1_20_40_100_500 +AAAGAATTCGAATTCATGTCGAAAGCTACATATAAGAATTCGAATTCGGAACGTGCTGCTACTCATCCTAGTCCTGTTGCTGCCAAGGAATTCGAATTC +CTATTTAATATCATGCACGAAAAGCAAACAAACTTGTGTGCTTCATTGGATGTTCGTACCACCAAGGAATTACTGGAGTTAGTTGAAGCATTAGGTCCC +AGAATTCGAATTCGGGTTTGTTTACTAAAAACACATGTGGATATCTTGACTGATTTTTCCATGGAGGGCACAGTTAAGCCGCTAAAGGCATTATCCGCC +AAGTACAATTTTTTACTCTTCGAAGACAGAAAATTTGCTGACATTGGTAATACAGTCAAATTGCAGTACTCTGCGGGTGTATACAGAATAGCAGAATGG +GCAGACATTACGAATGCACACGGTGTGGTGGGCCCAGGTATTGTTAGCGGTTTGAAGCAGGCGGCGGAAGAAGTAACAAAGGAACCTAGAGGCCTTTTG 
+ATGTTAGCAGAATTGTCATGCAAGGGCTCCCTAGCTACTGGAGAATATACTAAGGGTACTGTTGACATTGCGAAGAGCGACAAAGATTTTGTTATCGGC +TTTATTGCTCAAAGAGACATGGGTGGAAGAGATGAAGGTTACGATTGGTTGATTATGACACCCGGTGTGGGTTTAGATGACAAGGGAGACGCATTGGGT +CAACAGTATAGAACCGTGGAATTCGAATTCGATGATGTGGTCTCTACAGGATCTGACATTATTATTGTTGGAAGAGGACTATTTGCAAAGGGAAGGGAT +GCTAAGGTAGAGGGTGAACGTTACAGAAAAGCAGGCTGGGAAGCATATTTGAGAAGATGCGGCCAGCAAAACTAA diff --git a/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai b/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai new file mode 100644 index 0000000..bc6177b --- /dev/null +++ b/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai @@ -0,0 +1 @@ +seq1_20_40_100_500 867 20 99 100 diff --git a/inferelator_prior/tests/test_motif.py b/inferelator_prior/tests/test_motif.py new file mode 100644 index 0000000..bad2920 --- /dev/null +++ b/inferelator_prior/tests/test_motif.py @@ -0,0 +1,114 @@ +import unittest +import os +import io +import pandas as pd +import numpy.testing as npt +import copy + +from inferelator_prior.motifs import meme, homer_motif + +artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts") + +MEME_FILE_NAME = "test.meme" +MOTIF_FILE_NAME = "test.motif" +PWM_FILE_NAME = "M00799_2.00.txt" +ECORI_FILE_NAME = "test_ecori.meme" + +TEST_MOTIF_MATRIX = """\ +0.248650039776609 0.26139859992769 0.241301320519092 0.248650039776609 +0.392226269785661 0.219606847798542 0.0702344472606129 0.317932435155184 +0.0257615986027584 0.0113462234969035 0.942829024734492 0.0200631531658465 +0.931183694119653 0.0181098604626899 0.00917476304082202 0.0415316823768348 +0.0209101275685474 0.0204960793014344 0.00500398009361691 0.953589813036401 +0.652610298711976 0.0667517267238459 0.0282467996828702 0.252391174881308 +0.541666052707409 0.110459581183674 0.158324966820951 0.189549399287965 +0.196060485729292 0.286696226860535 0.350171562229744 0.167071725180429 +0.286129811839987 0.225077681884186 0.309960724838339 
0.178831781437488 +""" + +PWM = pd.read_csv(io.StringIO(TEST_MOTIF_MATRIX), sep="\t", index_col=None, header=None).values + + +class TestMotifParsers(unittest.TestCase): + + def test_meme_loader(self): + + meme_file_name = os.path.join(artifact_path, MEME_FILE_NAME) + motifs = meme.read(meme_file_name) + + with open(meme_file_name) as meme_fh: + motifs2 = meme.read(meme_fh) + + self.assertEqual(len(motifs), 1) + npt.assert_array_almost_equal(motifs[0].probability_matrix, PWM, 4) + npt.assert_array_almost_equal(motifs2[0].probability_matrix, PWM, 4) + + self.assertListEqual(motifs[0].alphabet, list("ACGT")) + self.assertEqual(motifs[0].alphabet_len, 4) + self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) + + def test_homer_motif_loader(self): + + motif_file_name = os.path.join(artifact_path, MOTIF_FILE_NAME) + motifs = homer_motif.read(motif_file_name) + + with open(motif_file_name) as meme_fh: + motifs2 = homer_motif.read(meme_fh) + + self.assertEqual(len(motifs), 1) + npt.assert_array_almost_equal(motifs[0].probability_matrix, PWM, 4) + npt.assert_array_almost_equal(motifs2[0].probability_matrix, PWM, 4) + + self.assertListEqual(motifs[0].alphabet, list("ACGT")) + self.assertEqual(motifs[0].alphabet_len, 4) + self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) + + def test_pwm_loader(self): + + motif_file_name = os.path.join(artifact_path, MOTIF_FILE_NAME) + motifs = homer_motif.read(motif_file_name) + + with open(motif_file_name) as meme_fh: + motifs2 = homer_motif.read(meme_fh) + + self.assertEqual(len(motifs), 1) + npt.assert_array_almost_equal(motifs[0].probability_matrix, PWM, 4) + npt.assert_array_almost_equal(motifs2[0].probability_matrix, PWM, 4) + + self.assertListEqual(motifs[0].alphabet, list("ACGT")) + self.assertEqual(motifs[0].alphabet_len, 4) + self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) + + +class TestMotifProps(unittest.TestCase): + + @classmethod + def setUpClass(cls): + + meme_file_name = 
os.path.join(artifact_path, ECORI_FILE_NAME) + cls.master_motif = meme.read(meme_file_name)[0] + + def setUp(self): + + self.motif = copy.deepcopy(self.master_motif) + + def test_information_content(self): + + self.assertEqual(self.motif.information_content, 24.0) + + def test_consensus(self): + + self.assertEqual(self.motif.consensus, "GAATTCGAATTC") + + def test_length(self): + + self.assertEqual(len(self.motif), 12) + + def test_score(self): + + self.assertEqual(self.motif.score_match("GAATTCGAATTC"), 24.0) + self.assertEqual(self.motif.score_match("GAATTCGAATAC"), 18.0) + self.assertEqual(self.motif.score_match("GAATTCGAATAA"), 12.0) + self.assertEqual(self.motif.score_match("GAATTCGATACG"), 0.0) + self.assertEqual(self.motif.score_match("GAATTCCTTAAG"), 0.0) + self.assertEqual(self.motif.score_match("CTTAAGCTTAAG"), 0.0) diff --git a/inferelator_prior/tests/test_motif_scan.py b/inferelator_prior/tests/test_motif_scan.py new file mode 100644 index 0000000..ed86977 --- /dev/null +++ b/inferelator_prior/tests/test_motif_scan.py @@ -0,0 +1,88 @@ +import unittest +import os +import io +import pandas as pd +import numpy as np +import numpy.testing as npt + +from inferelator_prior.motifs._motif import __MotifScanner as MotifScanner +from inferelator_prior.motifs import Motif, fimo, homer, SCAN_SCORE_COL + +artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts") + +FASTA_FILE_NAME = os.path.join(artifact_path, "test_motif_search.fasta") +BED_FILE_NAME = os.path.join(artifact_path, "test_motif_search.bed") + + +TEST_MOTIF_MATRIX = """\ +0.0\t0.0\t1.0\t0.0 +1.0\t0.0\t0.0\t0.0 +1.0\t0.0\t0.0\t0.0 +0.0\t0.0\t0.0\t1.0 +0.0\t0.0\t0.0\t1.0 +0.0\t1.0\t0.0\t0.0 +0.0\t0.0\t1.0\t0.0 +1.0\t0.0\t0.0\t0.0 +1.0\t0.0\t0.0\t0.0 +0.0\t0.0\t0.0\t1.0 +0.0\t0.0\t0.0\t1.0 +0.0\t1.0\t0.0\t0.0 +""" + +MOTIF_OBJ = Motif("EcoRI", "EcoRI", list("ACGT")) +MOTIF_OBJ.probability_matrix = pd.read_csv(io.StringIO(TEST_MOTIF_MATRIX), sep="\t", 
header=None, index_col=None)\ + .astype(float)\ + .values + +MOTIF_STARTS = [3, 35, 87, 199, 711] + + +class TestScan(unittest.TestCase): + + def test_base(self): + + scanner = MotifScanner(motifs=[MOTIF_OBJ], num_workers=1) + + with self.assertRaises(NotImplementedError): + scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) + + with self.assertRaises(NotImplementedError): + scanner._preprocess(8) + + with self.assertRaises(NotImplementedError): + scanner._parse_output(None) + + with self.assertRaises(NotImplementedError): + scanner._get_motifs(None, None) + + def test_fimo(self): + scanner = fimo.FIMOScanner(motifs=[MOTIF_OBJ], num_workers=1) + motif_locs = scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) + + self.assertEqual(motif_locs.shape[0], 10) + self.assertEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "+", :].shape[0], 5) + self.assertEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "-", :].shape[0], 5) + + self.assertListEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "+", fimo.FIMO_START].tolist(), + MOTIF_STARTS) + self.assertListEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "-", fimo.FIMO_START].tolist(), + MOTIF_STARTS) + + npt.assert_array_almost_equal(np.array([24.0] * 10), motif_locs[SCAN_SCORE_COL].values) + + def test_homer(self): + scanner = homer.HOMERScanner(motifs=[MOTIF_OBJ], num_workers=1) + motif_locs = scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) + + self.assertEqual(motif_locs.shape[0], 10) + self.assertEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "+", :].shape[0], 5) + self.assertEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "-", :].shape[0], 5) + + self.assertListEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "+", homer.HOMER_START] + .tolist(), + MOTIF_STARTS) + self.assertListEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "-", homer.HOMER_START].sort_values() + .tolist(), + MOTIF_STARTS) + + npt.assert_array_almost_equal(np.array([24.0] * 10), 
motif_locs[SCAN_SCORE_COL].values) diff --git a/inferelator_prior/tests/test_srr.py b/inferelator_prior/tests/test_srr.py new file mode 100644 index 0000000..8a1cdee --- /dev/null +++ b/inferelator_prior/tests/test_srr.py @@ -0,0 +1,50 @@ +import tempfile +import shutil +import os +import unittest + +from inferelator_prior.processor.srr import get_srr_files, unpack_srr_files + +TEST_SRR_IDS = ["SRR053325"] + + +class TestSRR(unittest.TestCase): + + srr_ids = TEST_SRR_IDS + temp_path = None + + @classmethod + def setUpClass(cls): + cls.temp_path = tempfile.mkdtemp() + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.temp_path, ignore_errors=True) + + def test_srr_get_success(self): + srr_files = get_srr_files(self.srr_ids, self.temp_path, prefetch_options=["--transport", "http"]) + self.assertTrue(os.path.exists(srr_files[0])) + self.assertEqual(os.path.getsize(srr_files[0]), 31838) + + srr_files_2 = get_srr_files(self.srr_ids, self.temp_path) + self.assertEqual(srr_files[0], srr_files_2[0]) + + def test_srr_get_fail(self): + self.assertIsNone(get_srr_files([""], os.path.join(self.temp_path, "blah", "blah"), + prefetch_options=["--transport", "http"])[0]) + + def test_srr_unpack_success(self): + srr_files = get_srr_files(self.srr_ids, self.temp_path, prefetch_options=["--transport", "http"]) + fastq_files = unpack_srr_files(self.srr_ids, srr_files, self.temp_path) + self.assertTrue(all(map(lambda x: os.path.exists(x), fastq_files[0]))) + self.assertEqual(len(fastq_files[0]), 3) + + fastq_files2 = unpack_srr_files(self.srr_ids, srr_files, self.temp_path) + self.assertListEqual(fastq_files[0], fastq_files2[0]) + + def test_srr_unpack_fail(self): + self.assertListEqual(unpack_srr_files(self.srr_ids, [""], self.temp_path)[0], [None]) + + def test_srr_unpack_skip(self): + self.assertListEqual(unpack_srr_files(self.srr_ids, [None], self.temp_path)[0], [None]) + diff --git a/inferelator_prior/tests/test_star.py b/inferelator_prior/tests/test_star.py new file 
mode 100644 index 0000000..1ce873b --- /dev/null +++ b/inferelator_prior/tests/test_star.py @@ -0,0 +1,72 @@ +import os +import tempfile +import unittest +import shutil + +import pysam + +from inferelator_prior.processor.star import star_mkref, star_align_fastqs + +GENOME_FILE_NAME = "sc64_mito.fasta" +ANNOTATION_FILE_NAME = "sc64_mito.gtf" +TEST_FASTQ = "test_alignment.fastq" + +STAR_REF_FILES = ["chrLength.txt", "chrName.txt", "exonGeTrInfo.tab", "geneInfo.tab", "genomeParameters.txt", "SA", + "sjdbInfo.txt", "sjdbList.out.tab", "chrNameLength.txt", "chrStart.txt", "exonInfo.tab", "Genome", + "SAindex", "sjdbList.fromGTF.out.tab", "transcriptInfo.tab"] + +artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts") + + +class TestSTAR(unittest.TestCase): + genome_file = os.path.join(artifact_path, GENOME_FILE_NAME) + annotation_file = os.path.join(artifact_path, ANNOTATION_FILE_NAME) + fastq_file = os.path.join(artifact_path, TEST_FASTQ) + + temp_path = None + star_ref_path = None + sam_out_path = None + + @classmethod + def setUpClass(cls): + cls.temp_path = tempfile.mkdtemp() + cls.star_ref_path = os.path.join(cls.temp_path, "star") + cls.sam_out_path = os.path.join(cls.temp_path, "sam") + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.temp_path, ignore_errors=True) + + def test_star(self): + self._step_mkref() + sam_files = self._step_align() + self._step_test_alignment(sam_files) + + def _step_mkref(self): + out_path = star_mkref(self.star_ref_path, genome_file=[self.genome_file], annotation_file=self.annotation_file, + move_files=False) + + self.assertFalse(out_path is None) + self.assertTrue(out_path == self.star_ref_path) + + for file_name in STAR_REF_FILES: + self.assertTrue(os.path.exists(os.path.join(out_path, file_name))) + + def _step_align(self): + sam_files = star_align_fastqs(["TEST"], [[self.fastq_file]], self.star_ref_path, self.sam_out_path) + + self.assertFalse(sam_files[0] is None) + + 
return sam_files + + def _step_test_alignment(self, sam_files): + samfile = pysam.AlignmentFile(sam_files[0], "r") + reads = [aln for aln in samfile.fetch()] + + self.assertEqual(len(reads), 9) + self.assertEqual(sum(map(lambda x: x.is_reverse, reads)), 3) + + self.assertListEqual(list(map(lambda x: x.reference_start, reads)), + [28620, 30480, 3078, 4893, 5465, 5545, 37969, 37969, 38033]) + self.assertListEqual(list(map(lambda x: x.reference_end, reads)), + [28719, 30579, 3177, 4992, 5564, 5644, 38068, 38068, 38132]) diff --git a/inferelator_prior/tests/test_utils.py b/inferelator_prior/tests/test_utils.py new file mode 100644 index 0000000..6ccc597 --- /dev/null +++ b/inferelator_prior/tests/test_utils.py @@ -0,0 +1,55 @@ +import os +import shutil +import tempfile +import unittest +from urllib.error import URLError + +import inferelator_prior.processor.utils as utils + + +class TestUtils(unittest.TestCase): + temp_path = None + + @classmethod + def setUpClass(cls): + cls.temp_path = tempfile.mkdtemp() + cls.star_ref_path = os.path.join(cls.temp_path, "star") + cls.sam_out_path = os.path.join(cls.temp_path, "sam") + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.temp_path, ignore_errors=True) + + def test_ncbi_genomes(self): + fasta, gtf = utils.get_genome_file_locs("sc64") + self.assertEqual(gtf[1], "sc64.gtf.gz") + + with self.assertRaises(ValueError): + utils.get_genome_file_locs("not_a_real_thing") + + def test_get_file_from_url(self): + fasta, gtf = utils.get_genome_file_locs("sc64") + target_path = os.path.join(self.temp_path, "test.gtf.gz") + file_path = utils.get_file_from_url(gtf[0], target_path) + self.assertEqual(file_path, target_path) + self.assertTrue(os.path.exists(file_path)) + + with self.assertRaises(URLError): + file_path = utils.get_file_from_url(gtf[0] + "does_not_exist.file", target_path) + + def test_requirements(self): + self.assertTrue(utils.test_requirements_exist(test_targets=["python"], + test_package={"python": ("python", 
["python", "--version"])}, + test_htseq=False, test_chroma=False)) + + self.assertTrue(utils.test_requirements_exist(test_targets=["python"], + test_package={"python": ("python", ["python", "--version"])}, + test_htseq=True, test_chroma=True)) + + with self.assertRaises(FileNotFoundError): + utils.test_requirements_exist(test_targets=["not_a-.thing"], + test_package={"not_a-.thing": ("fake", ["not_a-.thing", "--version"])}, + test_htseq=False, test_chroma=False) + + def test_file_path_abs(self): + self.assertEqual(os.path.abspath(os.path.expanduser("~")), utils.file_path_abs("~")) diff --git a/inferelator_prior/velocity/__init__.py b/inferelator_prior/velocity/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inferelator_prior/velocity/calc.py b/inferelator_prior/velocity/calc.py new file mode 100644 index 0000000..2287fb6 --- /dev/null +++ b/inferelator_prior/velocity/calc.py @@ -0,0 +1,79 @@ +import numpy as _np +from scipy.sparse import issparse as _is_sparse + + +def calc_velocity(expr, time_axis, neighbor_graph, n_neighbors): + """ + Calculate local RNA velocity + + :param expr: Samples x Genes numpy with expression data + :param time_axis: Samples, numpy array + :param neighbor_graph: Samples x Samples numpy or scipy.sparse with nearest neighbor distances + :param n_neighbors: Number of neighbors to use, int + :return: Samples x Genes numpy with velocity data + """ + + n_gen = _find_local(expr, neighbor_graph, n_neighbors) + return _np.vstack([_calc_local_velocity(expr[n_idx, :].copy(), + time_axis[n_idx].copy(), + (n_idx == i).nonzero()[0][0]) + for i, n_idx in n_gen]) + + +def _calc_local_velocity(expr, time_axis, center_index): + """ + Calculate a local rate of change + + :param expr: Samples x Genes numpy with expression data + :param time_axis: Samples, numpy array + :param center_index: The data point which we're calculating velocity for + :return: + """ + + n, m = expr.shape + + # Calculate change in time relative to the centerpoint 
+ if time_axis[center_index] > 90: + time_axis[time_axis < 30] = time_axis[time_axis < 30] + 120 + elif time_axis[center_index] < 30: + time_axis[time_axis > 90] = time_axis[time_axis > 90] - 120 + + time_axis = (time_axis - time_axis[center_index]).reshape(-1, 1) + + # Calculate (XT * X)^-1 * X + x_for_hat = _np.dot(_np.linalg.inv(_np.dot(time_axis.T, time_axis)), time_axis.T) + + # Calculate change in expression relative to the centerpoint + y_diff = _np.subtract(expr, expr[center_index, :]) + + # Return the slope for each gene as velocity + return _np.array([_np.dot(x_for_hat, y_diff[:, i])[0] for i in range(m)]) + + +def _find_local(expr, neighbor_graph, n_neighbors): + """ + Find a return an expression matrix for a locally connected graph + + :param expr: Samples x Genes numpy or scipy with expression data + :param neighbor_graph: Samples x Samples numpy or scipy with neighbor distances as 1/dist. + :param n_neighbors: + :return: + """ + + n, m = expr.shape + neighbor_sparse = _is_sparse(neighbor_graph) + + for i in range(n): + n_slice = neighbor_graph[i, :] + if neighbor_sparse: + if n_slice.data.shape[0] > n_neighbors: + keepers = n_slice.indices[_np.argsort(n_slice.data)[-1 * n_neighbors:]] + else: + keepers = n_slice.indices + else: + keepers = _np.argsort(n_slice)[-1 * n_neighbors:] + + if i % 100 == 0: + print("Extracted {n} neighbors for sample {i} / {t}".format(n=len(keepers), i=i, t=n)) + + yield i, _np.insert(keepers, 0, i) From 6d96350903cbc9e44521b6ae3d50e6064d69bdbb Mon Sep 17 00:00:00 2001 From: asistradition Date: Mon, 28 Sep 2020 15:24:46 -0400 Subject: [PATCH 14/20] DBSCAN for sparsity --- inferelator_prior/network_from_motifs.py | 30 +- .../processor/_species_constants.py | 34 + inferelator_prior/processor/prior.py | 75 +- srrTomat0/ATACBedTomat0.py | 139 -- srrTomat0/ATACTomat0.py | 87 - srrTomat0/ChIPBedTomat0.py | 145 -- srrTomat0/PWMToMEME.py | 34 - srrTomat0/RNASeqTomat0.py | 134 -- srrTomat0/__init__.py | 25 - 
srrTomat0/motifs/__init__.py | 4 - srrTomat0/motifs/_motif.py | 384 ----- srrTomat0/motifs/fimo.py | 68 - srrTomat0/motifs/homer.py | 70 - srrTomat0/motifs/homer_motif.py | 71 - srrTomat0/motifs/meme.py | 179 --- srrTomat0/motifs/motif_scan.py | 49 - srrTomat0/motifs/pwm.py | 46 - srrTomat0/motifs/transfac.py | 90 -- srrTomat0/processor/__init__.py | 0 srrTomat0/processor/bedtools.py | 123 -- srrTomat0/processor/chroma.py | 13 - srrTomat0/processor/gtf.py | 144 -- srrTomat0/processor/htseq_count.py | 88 - srrTomat0/processor/matrix.py | 176 -- srrTomat0/processor/prior.py | 364 ----- srrTomat0/processor/samtools.py | 83 - srrTomat0/processor/srr.py | 173 -- srrTomat0/processor/star.py | 218 --- srrTomat0/processor/utils.py | 142 -- srrTomat0/srrTomat0_mkref.py | 32 - srrTomat0/tests/__init__.py | 0 srrTomat0/tests/artifacts/M00799_2.00.txt | 10 - srrTomat0/tests/artifacts/sc64_mito.fasta | 1431 ----------------- srrTomat0/tests/artifacts/sc64_mito.gtf | 317 ---- srrTomat0/tests/artifacts/test.meme | 23 - srrTomat0/tests/artifacts/test.motif | 10 - .../tests/artifacts/test_alignment.fastq.gz | Bin 9201 -> 0 bytes srrTomat0/tests/artifacts/test_ecori.meme | 26 - .../tests/artifacts/test_motif_search.bed | 2 - .../tests/artifacts/test_motif_search.fasta | 10 - srrTomat0/tests/test_motif.py | 111 -- srrTomat0/tests/test_motif_scan.py | 88 - srrTomat0/tests/test_srr.py | 50 - srrTomat0/tests/test_star.py | 72 - srrTomat0/tests/test_utils.py | 55 - 45 files changed, 69 insertions(+), 5356 deletions(-) create mode 100644 inferelator_prior/processor/_species_constants.py delete mode 100644 srrTomat0/ATACBedTomat0.py delete mode 100644 srrTomat0/ATACTomat0.py delete mode 100755 srrTomat0/ChIPBedTomat0.py delete mode 100644 srrTomat0/PWMToMEME.py delete mode 100644 srrTomat0/RNASeqTomat0.py delete mode 100644 srrTomat0/__init__.py delete mode 100644 srrTomat0/motifs/__init__.py delete mode 100644 srrTomat0/motifs/_motif.py delete mode 100644 srrTomat0/motifs/fimo.py delete 
mode 100644 srrTomat0/motifs/homer.py delete mode 100644 srrTomat0/motifs/homer_motif.py delete mode 100644 srrTomat0/motifs/meme.py delete mode 100644 srrTomat0/motifs/motif_scan.py delete mode 100644 srrTomat0/motifs/pwm.py delete mode 100644 srrTomat0/motifs/transfac.py delete mode 100644 srrTomat0/processor/__init__.py delete mode 100644 srrTomat0/processor/bedtools.py delete mode 100644 srrTomat0/processor/chroma.py delete mode 100644 srrTomat0/processor/gtf.py delete mode 100644 srrTomat0/processor/htseq_count.py delete mode 100644 srrTomat0/processor/matrix.py delete mode 100644 srrTomat0/processor/prior.py delete mode 100644 srrTomat0/processor/samtools.py delete mode 100644 srrTomat0/processor/srr.py delete mode 100644 srrTomat0/processor/star.py delete mode 100644 srrTomat0/processor/utils.py delete mode 100644 srrTomat0/srrTomat0_mkref.py delete mode 100644 srrTomat0/tests/__init__.py delete mode 100644 srrTomat0/tests/artifacts/M00799_2.00.txt delete mode 100644 srrTomat0/tests/artifacts/sc64_mito.fasta delete mode 100644 srrTomat0/tests/artifacts/sc64_mito.gtf delete mode 100644 srrTomat0/tests/artifacts/test.meme delete mode 100644 srrTomat0/tests/artifacts/test.motif delete mode 100644 srrTomat0/tests/artifacts/test_alignment.fastq.gz delete mode 100644 srrTomat0/tests/artifacts/test_ecori.meme delete mode 100644 srrTomat0/tests/artifacts/test_motif_search.bed delete mode 100644 srrTomat0/tests/artifacts/test_motif_search.fasta delete mode 100644 srrTomat0/tests/test_motif.py delete mode 100644 srrTomat0/tests/test_motif_scan.py delete mode 100644 srrTomat0/tests/test_srr.py delete mode 100644 srrTomat0/tests/test_star.py delete mode 100644 srrTomat0/tests/test_utils.py diff --git a/inferelator_prior/network_from_motifs.py b/inferelator_prior/network_from_motifs.py index 79bf42d..a5d4b6d 100644 --- a/inferelator_prior/network_from_motifs.py +++ b/inferelator_prior/network_from_motifs.py @@ -1,7 +1,9 @@ -from inferelator_prior.processor.gtf import 
load_gtf_to_dataframe, open_window, GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND +from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, GTF_CHROMOSOME, + SEQ_START, SEQ_STOP, GTF_STRAND) from inferelator_prior.processor.prior import build_prior_from_atac_motifs, MotifScorer from inferelator_prior.motifs.motif_scan import MotifScan from inferelator_prior.motifs import motifs_to_dataframe, INFO_COL, MOTIF_NAME_COL +from inferelator_prior.processor._species_constants import SPECIES_MAP import argparse import os @@ -28,6 +30,8 @@ def main(): ap.add_argument("--tandem_window", dest="tandem", help="Bases between TF bindings to consider an array", metavar="BASES", type=int, default=100) ap.add_argument("--threshold", nargs="+", default=None, type=str) + ap.add_argument("--species", dest="species", help="Load settings for a target species. Overrides other settings", + default=None, type=str, choices=list(SPECIES_MAP.keys()) + [None]) args = ap.parse_args() out_prefix = os.path.abspath(os.path.expanduser(args.out)) @@ -35,16 +39,29 @@ def main(): if not os.path.exists(out_path): os.makedirs(out_prefix) + _species = args.species.lower() if args.species is not None else None + + if _species is None: + _window = args.window_size + _tandem = args.tandem + _use_tss = args.tss + else: + _window = SPECIES_MAP[_species]['window'] + _tandem = SPECIES_MAP[_species]['tandem'] + _use_tss = SPECIES_MAP[_species]['use_tss'] + if args.threshold is None: prior_edges, prior_matrix, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation, args.fasta, - window_size=args.window_size, + window_size=_window, num_cores=args.cores, - use_tss=args.tss, motif_ic=args.min_ic, + use_tss=_use_tss, + motif_ic=args.min_ic, + tandem=_tandem, scanner_type=args.scanner, motif_format=args.motif_format) - prior_matrix.astype(int).to_csv(out_prefix + "_edge_matrix.tsv.gz", sep="\t") + (prior_matrix != 0).astype(int).to_csv(out_prefix + "_edge_matrix.tsv.gz", 
sep="\t") prior_edges.to_csv(out_prefix + "_edge_table.tsv.gz", sep="\t") raw_matrix.to_csv(out_prefix + "_unfiltered_matrix.tsv.gz", sep="\t") else: @@ -56,11 +73,12 @@ def main(): for t in args.threshold: prior_edges, prior_matrix, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation, args.fasta, - window_size=args.window_size, + window_size=_window, num_cores=args.cores, - use_tss=args.tss, motif_ic=args.min_ic, + use_tss=_use_tss, motif_ic=args.min_ic, scanner_type=args.scanner, scanner_thresh=t, + tandem=_tandem, motif_format=args.motif_format) edge_count[t] = (raw_matrix != 0).sum(axis=0) diff --git a/inferelator_prior/processor/_species_constants.py b/inferelator_prior/processor/_species_constants.py new file mode 100644 index 0000000..cf734c2 --- /dev/null +++ b/inferelator_prior/processor/_species_constants.py @@ -0,0 +1,34 @@ +_YEAST = { + "use_tss": True, + "window": (1000, 100), + "tandem": 25 +} + +_FLY = { + "use_tss": True, + "window": (50000, 2000), + "tandem": 100 +} + +_MOUSE = { + "use_tss": True, + "window": (50000, 2000), + "tandem": 100 +} + +_HUMAN = { + "use_tss": True, + "window": (50000, 2000), + "tandem": 100 +} + +SPECIES_MAP = { + "yeast": _YEAST, + "saccharomyces cerevisiae": _YEAST, + "fly": _FLY, + "drosophila melanogaster": _FLY, + "mouse": _MOUSE, + "mus musculus": _MOUSE, + "human": _HUMAN, + "homo sapiens": _HUMAN +} \ No newline at end of file diff --git a/inferelator_prior/processor/prior.py b/inferelator_prior/processor/prior.py index 334ad1d..c143ba4 100644 --- a/inferelator_prior/processor/prior.py +++ b/inferelator_prior/processor/prior.py @@ -6,7 +6,7 @@ import pandas.api.types as pat import numpy as np import pathos.multiprocessing as multiprocessing -from sklearn.cluster import DBSCAN +from sklearn.cluster import DBSCAN, OPTICS from sklearn.neighbors import LocalOutlierFactor from sklearn.covariance import EllipticEnvelope from scipy.ndimage.filters import uniform_filter1d @@ -233,9 +233,9 @@ def 
_prior_mapper(data): raw_matrix.index.name = PRIOR_GENE prior_matrix = raw_matrix.copy() - # Threshold using DBSCAN outlier detection + # Threshold per-TF using DBSCAN for reg in prior_matrix.columns: - prior_matrix.loc[~_find_outliers_lof(prior_matrix[reg]), reg] = 0. + prior_matrix.loc[~_find_outliers_dbscan(prior_matrix[reg]), reg] = 0. # Keep the peaks that we want thresholded_data = prior_matrix.reset_index().melt(id_vars=PRIOR_GENE, var_name=PRIOR_TF, value_name='T') @@ -260,75 +260,22 @@ def _gene_gen(genes, motif_peaks): continue -def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): +def _find_outliers_dbscan(tf_data, max_sparsity=0.05): scores = tf_data.values.reshape(-1, 1) - counts = tf_data.shape[0] - labels = DBSCAN(min_samples=np.log2(counts), eps=scores.max() / 100).fit_predict(scores) + labels = DBSCAN(min_samples=max(int(scores.size * 0.001), 10), eps=1, algorithm='brute').fit_predict(scores) # Keep any outliers (outliers near 0 should be discarded) keep_edge = pd.Series((labels == -1) & (tf_data.values > np.mean(scores)), index=tf_data.index) - # Iterate through clusters in reverse order until at least t_1 and no more than t_2 edges are included - for lab in np.unique(labels)[::-1]: - current_ratio = keep_edge.sum() / keep_edge.size - new_labels = labels == lab - if current_ratio > t_1: - break - elif current_ratio + (new_labels.sum() / new_labels.size) > t_2: - break - else: - keep_edge |= new_labels - - return keep_edge - - -def _find_outliers_elliptic_envelope(tf_data, outlier=2.5, skip_threshold=0.002): - - scores = tf_data.values - keep_genes = pd.Series(False, index=tf_data.index) - - if np.var(scores) == 0.: - return keep_genes - - _nz_idx = scores > 0 - - if skip_threshold is not None and np.sum(_nz_idx) < (len(keep_genes) * skip_threshold): - return keep_genes | _nz_idx + # Add the cluster of values with the largest scores unless that exceeds max_sparsity + current_ratio = keep_edge.sum() / keep_edge.size + new_labels = labels == 
np.unique(labels)[-1] - # Calculate Mahalanobis distance - _nzs = scores[_nz_idx].reshape(-1, 1) + if current_ratio + (new_labels.sum() / new_labels.size) <= max_sparsity: + keep_edge |= new_labels - if np.var(_nzs) == 0.: - return keep_genes - - # Correct for just the dumbest bug in scipy - if (np.mean(_nzs) == np.median(_nzs)) & (np.sum(_nzs == np.median(_nzs)) > (0.5 * _nzs.size)): - keep_genes[_nz_idx] = (_nzs >= np.median(_nzs)).flatten() - return keep_genes - - m_dist = EllipticEnvelope(support_fraction=1).fit(_nzs).score_samples(_nzs) - scaled_m_dist = (m_dist - np.mean(m_dist)) / np.std(m_dist) - keep_genes[_nz_idx] = scaled_m_dist < (-1 * outlier) - - return keep_genes - - -def _find_outliers_lof(tf_data, skip_threshold=0.002): - scores = tf_data[tf_data > 0] - keep_genes = pd.Series(False, index=tf_data.index) - - if np.var(scores) == 0.: - return keep_genes - - if skip_threshold is not None and scores.size < (tf_data.size * skip_threshold): - keep_genes[scores.index] = True - return keep_genes - - labels = LocalOutlierFactor(n_neighbors=max(int(scores.size * 0.01), 20), - algorithm='brute', p=1).fit_predict(scores.values.reshape(-1, 1)) - keep_genes[scores.index] = (labels == -1) & (scores.values > np.mean(scores)) - return keep_genes + return keep_edge def _build_prior_for_gene(gene_info, motif_data, motif_information, num_iteration): diff --git a/srrTomat0/ATACBedTomat0.py b/srrTomat0/ATACBedTomat0.py deleted file mode 100644 index 4213991..0000000 --- a/srrTomat0/ATACBedTomat0.py +++ /dev/null @@ -1,139 +0,0 @@ -from srrTomat0.processor.gtf import load_gtf_to_dataframe, open_window, GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND -from srrTomat0.processor.prior import build_prior_from_atac_motifs, MotifScorer -from srrTomat0.motifs.motif_scan import MotifScan -from srrTomat0.motifs import motifs_to_dataframe, INFO_COL, MOTIF_NAME_COL - -import argparse -import os -import pathlib -import pandas as pd - - -def main(): - ap = 
argparse.ArgumentParser(description="Create a prior from open chromatin peaks and motif peaks") - ap.add_argument("-m", "--motif", dest="motif", help="Motif file", metavar="PATH", required=True) - ap.add_argument("--motif_format", dest="motif_format", help="Motif file FORMAT (transfac or meme)", - metavar="FORMAT", default="meme") - ap.add_argument("-a", "--atac", dest="atac", help="ATAC BED file", metavar="FILE", default=None) - ap.add_argument("-f", "--fasta", dest="fasta", help="Genomic FASTA file", metavar="FILE", required=True) - ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="PATH", required=True) - ap.add_argument("-o", "--out", dest="out", help="Output PATH prefix", metavar="PATH", required=True) - ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=0, nargs="+") - ap.add_argument("-c", "--cpu", dest="cores", help="Number of cores", metavar="CORES", type=int, default=1) - ap.add_argument("--tss", dest="tss", help="Use TSS for window", action='store_const', const=True, default=False) - ap.add_argument("--scan", dest="scanner", help="FIMO or HOMER", type=str, default='fimo') - ap.add_argument("--motif_preprocessing_ic", dest="min_ic", help="Minimum information content", - metavar="BITS", type=int, default=None) - ap.add_argument("--tandem_window", dest="tandem", help="Bases between TF bindings to consider an array", - metavar="BASES", type=int, default=100) - ap.add_argument("--threshold", nargs="+", default=None, type=str) - - args = ap.parse_args() - out_prefix = os.path.abspath(os.path.expanduser(args.out)) - out_path = os.path.join(*pathlib.PurePath(out_prefix).parts[:-1]) - if not os.path.exists(out_path): - os.makedirs(out_prefix) - - if args.threshold is None: - prior_edges, prior_matrix, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation, - args.fasta, - window_size=args.window_size, - num_cores=args.cores, - use_tss=args.tss, 
motif_ic=args.min_ic, - scanner_type=args.scanner, - motif_format=args.motif_format) - - prior_matrix.astype(int).to_csv(out_prefix + "_edge_matrix.tsv.gz", sep="\t") - prior_edges.to_csv(out_prefix + "_edge_table.tsv.gz", sep="\t") - raw_matrix.to_csv(out_prefix + "_unfiltered_matrix.tsv.gz", sep="\t") - else: - motifs = MotifScan.load_motif_file(args.motif) - motif_information = motifs_to_dataframe(motifs) - motif_information = motif_information[[MOTIF_NAME_COL, INFO_COL]].groupby(MOTIF_NAME_COL).agg("max") - - edge_count = {} - for t in args.threshold: - prior_edges, prior_matrix, raw_matrix = build_atac_motif_prior(args.motif, args.atac, args.annotation, - args.fasta, - window_size=args.window_size, - num_cores=args.cores, - use_tss=args.tss, motif_ic=args.min_ic, - scanner_type=args.scanner, - scanner_thresh=t, - motif_format=args.motif_format) - - edge_count[t] = (raw_matrix != 0).sum(axis=0) - - edge_count = pd.concat(edge_count, axis=1) - edge_count = edge_count.join(motif_information[INFO_COL]) - - edge_count.to_csv(out_prefix + "_edge_count.tsv", sep="\t") - - -def build_atac_motif_prior(motif_file, atac_bed_file, annotation_file, genomic_fasta_file, window_size=0, - use_tss=True, scanner_type='fimo', num_cores=1, motif_ic=6, tandem=100, - truncate_motifs=0.35, scanner_thresh="1e-4", motif_format="meme"): - # Set the scanner type - if scanner_type.lower() == 'fimo': - MotifScan.set_type_fimo() - elif scanner_type.lower() == 'homer': - MotifScan.set_type_homer() - else: - raise ValueError("motif_type must be fimo or homer") - - # PROCESS GENE ANNOTATIONS # - - print("Loading genes from file ({f})".format(f=annotation_file)) - # Load genes and open a window - genes = load_gtf_to_dataframe(annotation_file) - print("\t{n} genes loaded".format(n=genes.shape[0])) - - genes = open_window(genes, window_size=window_size, use_tss=use_tss, check_against_fasta=genomic_fasta_file) - print("\tPromoter regions defined with window {w}".format(w=window_size)) - - # 
PROCESS MOTIF PWMS # - - print("Loading motifs from file ({f})".format(f=motif_file)) - if motif_format.lower() == "meme": - from srrTomat0.motifs.meme import read - elif motif_format.lower() == "transfac": - from srrTomat0.motifs.transfac import read - elif motif_format.lower() == "homer": - from srrTomat0.motifs.homer_motif import read - else: - raise ValueError("motif_format must be 'meme', 'homer', or 'transfac'") - - motifs = read(motif_file) - motif_information = motifs_to_dataframe(motifs) - print("\t{n} motifs loaded".format(n=len(motif_information))) - - if truncate_motifs is not None: - [x.truncate(threshold=truncate_motifs) for x in motifs] - - # SCAN CHROMATIN FOR MOTIFS # - - # Load and scan target chromatin peaks - print("Scanning target chromatin ({f_c}) for motifs ({f_m})".format(f_c=atac_bed_file, f_m=motif_file)) - - gene_locs = genes.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND]].copy() - gene_locs[[SEQ_START, SEQ_STOP]] = gene_locs[[SEQ_START, SEQ_STOP]].astype(int) - - motif_peaks = MotifScan.scanner(motifs=motifs, num_workers=num_cores).scan(genomic_fasta_file, - atac_bed_file=atac_bed_file, - promoter_bed=gene_locs, - min_ic=motif_ic, - threshold=scanner_thresh) - - # PROCESS CHROMATIN PEAKS INTO NETWORK MATRIX # - - # Processing into prior - print("Processing TF binding sites into prior") - MotifScorer.set_information_criteria(min_binding_ic=motif_ic, max_dist=tandem) - prior_edges, prior_matrix, raw_matrix = build_prior_from_atac_motifs(genes, motif_peaks, motif_information, - num_workers=num_cores) - print("Prior matrix with {n} edges constructed".format(n=prior_edges.shape[0])) - return prior_edges, prior_matrix, raw_matrix - - -if __name__ == '__main__': - main() diff --git a/srrTomat0/ATACTomat0.py b/srrTomat0/ATACTomat0.py deleted file mode 100644 index a176dad..0000000 --- a/srrTomat0/ATACTomat0.py +++ /dev/null @@ -1,87 +0,0 @@ -import argparse -import os - -import pandas as pd - -from srrTomat0.processor.utils import 
file_path_abs -from srrTomat0.processor.srr import get_srr_files, unpack_srr_files -from srrTomat0.processor.star import star_align_fastqs -from srrTomat0.processor.samtools import sam_sort - -from srrTomat0 import SRR_SUBPATH, FASTQ_SUBPATH, STAR_ALIGNMENT_SUBPATH, BAM_SUBPATH - - -OUTPUT_MATRIX_FILE_NAME = "atac_matrix.tsv" - -COUNT_FILE_METAINDEXES = ["N_unmapped", "N_multimapping", "N_noFeature", "N_ambiguous"] -COUNT_FILE_HEADER = ["Total", "MinusStrand", "PlusStrand"] -COUNT_FILE_HEADER_FOR_OUTPUT = "Total" - - -def main(): - ap = argparse.ArgumentParser(description="Turn ATAC-seq expression SRRs from NCBI GEO into a prior matrix") - ap.add_argument("-s", "--srr", dest="srr", help="SRR record IDs", nargs="+", metavar="SRRID", default=None) - ap.add_argument("-f", "--file", dest="file", help="List of SRR records in a TXT file", metavar="FILE", default=None) - ap.add_argument("-g", "--genome", dest="genome", help="STAR reference genome", metavar="PATH", required=True) - ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) - ap.add_argument("--gzip", dest="gzip", help="GZIP output file", action='store_const', const=True, default=False) - - args = ap.parse_args() - srr_ids = list() - - if args.srr is None and args.file is None: - print("One of --srr or --file must be set") - exit(1) - elif args.srr is not None and args.file is not None: - print("Only one of --srr or --file may be set (not both)") - exit(1) - elif args.srr is not None: - # SRR IDs are provided at command line - srr_ids = args.srr - elif args.file is not None: - # SRR IDs are in a .txt file; read them into a list - srr_ids = pd.read_csv(args.file, sep="\t", index_col=None, header=None).iloc[:, 0].tolist() - else: - raise ValueError("There is something wrong with this switch") - - atac_tomat0(srr_ids, args.out, args.genome, gzip_output=args.gzip) - - -def atac_tomat0(srr_ids, output_path, star_reference_genome, gzip_output=False, cores=4, star_jobs=2, 
star_args=None, - min_quality=None): - - star_args = [] if star_args is None else star_args - - output_path = file_path_abs(output_path) - os.makedirs(output_path, exist_ok=True) - - # Download all the SRR files - print("Downloading SRR files") - os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True) - srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores) - - # Unpack all the SRR files into FASTQ files - print("Unpacking SRR files") - os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True) - fastq_file_names = unpack_srr_files(srr_ids, srr_file_names, os.path.join(output_path, FASTQ_SUBPATH), - num_workers=cores) - - # Run all the FASTQ files through STAR to align - print("Aligning FASTQ files") - os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True) - thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs)) - sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome, - os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), - num_workers=star_jobs, threads_per_worker=thread_count, star_options=star_args) - - # Sort all the SAM files into BAM files - print("Sorting SAM files into BAM files") - os.makedirs(os.path.join(output_path, BAM_SUBPATH), exist_ok=True) - bam_file_names = sam_sort(srr_ids, sam_file_names, os.path.join(output_path, BAM_SUBPATH), min_quality=min_quality, - num_workers=cores) - - -if __name__ == '__main__': - main() - - diff --git a/srrTomat0/ChIPBedTomat0.py b/srrTomat0/ChIPBedTomat0.py deleted file mode 100755 index 336b7cd..0000000 --- a/srrTomat0/ChIPBedTomat0.py +++ /dev/null @@ -1,145 +0,0 @@ -import argparse -import pandas as pd - -import pybedtools - -from srrTomat0.processor.gtf import load_gtf_to_dataframe, SEQ_START, SEQ_STOP, GTF_GENENAME, SEQ_TSS -from srrTomat0.processor.utils import file_path_abs -from srrTomat0.processor.bedtools import get_peaks_in_features - -# Column names -BED_CHROMOSOME = 'chrom' 
-SEQ_COUNTS = 'count' -SEQ_BIN = 'bin' - -# Quantiles for bin -PEAK_QUANTILES = [0.25, 0.5, 0.75, 1] - - -def main(): - ap = argparse.ArgumentParser(description="Load peaks and genes.") - ap.add_argument("-f", "--file", dest="file", help="TSV file with ID|BED PATH pairs", metavar="FILE", default=None) - ap.add_argument("-b", "--bed", dest="bed", help="BED file containing ChIP peaks", nargs="+", metavar="FILE", - default=None) - ap.add_argument("-a", "--annotation", dest="anno", help="GTF/GFF Annotation File", metavar="FILE", required=True) - ap.add_argument("-o", "--out", dest="out", help="Output TSV PATH", metavar="PATH", required=True) - ap.add_argument("-w", "--window", dest="window_size", help="Window size", type=int, default=0) - ap.add_argument("-g", "--genebody", dest="gene", help="Gene body", action="store_const", const=True, default=False) - ap.add_argument("-t", "--tss", dest="tss", help="Transcription start site", action="store_const", const=True, - default=False) - args = ap.parse_args() - - if args.bed is None and args.file is None: - print("One of --bed or --file must be set") - exit(1) - elif args.bed is not None and args.file is not None: - print("Only one of --bed or --file may be set (not both)") - exit(1) - elif args.bed is not None: - # SRR IDs are provided at command line - id_names = args.bed - chip_bed_files = args.bed - elif args.file is not None: - # SRR IDs are in a .txt file; read them into a list - chip_samples = pd.read_csv(args.file, sep="\t", index_col=None, header=None) - if chip_samples.shape[1] != 2: - print("The TSV file must have two columns: ID and File_Path") - id_names = chip_samples.iloc[:, 0].tolist() - chip_bed_files = chip_samples.iloc[:, 1].tolist() - else: - raise ValueError("There is something wrong with this switch") - - if args.gene is False and args.tss is False: - print("One of --genebody or --tss must be set") - exit(1) - elif args.gene is not False and args.tss is not False: - print("Only one of --genebody or 
--tss may be set (not both)") - exit(1) - - chip_bed_tomat0(id_names, chip_bed_files, args.anno, output_path=args.out, window_size=args.window_size, - gene_body_flag=args.gene, tss_flag=args.tss) - - -def chip_bed_tomat0(id_names, chip_peaks_file, annotation_file, output_path=None, window_size=0, gene_body_flag = False, - tss_flag = False): - """ - Process a BED file of peaks into a integer peak-count matrix - :param chip_peaks_file: list(str) - List of paths to a BED file - :param output_path: str - Path to the output TSV file - :param annotation_file: str - Path to the GTF annotation file - :param window_size: int - Window on each side of a gene to include a peak in the count - 100 means 100bp up from start and 100bp down from end - :return gene_counts: pd.DataFrame - Integer count matrix of peaks per gene - """ - - # Convert paths to absolutes - output_path = file_path_abs(output_path) - annotation_file = file_path_abs(annotation_file) - - # Load annotations into a dataframe with pybedtools - # Adjust the start and stop positions to account for a flanking window - genes = load_gtf_to_dataframe(annotation_file) - - if gene_body_flag: - genes = open_window(genes, window_size) - if tss_flag: - genes = open_tss(genes, window_size) - - - prior_data = pd.DataFrame(index=genes[GTF_GENENAME]) - for id_name, peak_file in zip(id_names, chip_peaks_file): - # Load BED file into a dataframe with pybedtools - peak_file = file_path_abs(peak_file) - chip_peaks = pybedtools.BedTool(peak_file).to_dataframe() - gene_counts = get_peaks_in_features(genes, chip_peaks) - - # Get non-zero quantiles and use them to bin peak overlap by length - quantiles = gene_counts.loc[gene_counts[SEQ_COUNTS] != 0, SEQ_COUNTS].quantile(PEAK_QUANTILES) - gene_counts[SEQ_BIN] = 0 - - for i, qval in enumerate(quantiles.sort_values(ascending=True)): - gene_counts.loc[gene_counts[SEQ_COUNTS] >= qval, SEQ_BIN] = i + 1 - - # Rename the column with ID and reindex for join - gene_counts = 
gene_counts.rename({SEQ_BIN: id_name}).set_index(GTF_GENENAME).drop([SEQ_COUNTS], axis=1) - prior_data = prior_data.join(gene_counts, on=[GTF_GENENAME]) - - if output_path is not None: - prior_data.to_csv(output_path, sep="\t") - - return prior_data - - -def open_window(annotation_dataframe, window_size): - """ - This needs to adjust the start and stop in the annotation dataframe with window sizes - :param annotation_dataframe: pd.DataFrame - :param window_size: int - :return windowed_dataframe: pd.DataFrame - """ - windowed_dataframe = annotation_dataframe.copy() - windowed_dataframe[SEQ_START] = windowed_dataframe[SEQ_START] - window_size - windowed_dataframe[SEQ_STOP] = windowed_dataframe[SEQ_STOP] + window_size - windowed_dataframe.loc[windowed_dataframe[SEQ_START] < 0, SEQ_START] = 0 - return windowed_dataframe - -def open_tss(annotation_file, window_size): - """ - This needs to adjust the start and stop in the annotation dataframe with window sizes - :param annotation_dataframe: pd.DataFrame - :param window_size: int - :return windowed_dataframe: pd.DataFrame - """ - tss_dataframe = annotation_file.copy() - tss_dataframe[SEQ_START] = tss_dataframe[SEQ_TSS] - window_size - tss_dataframe[SEQ_STOP] = tss_dataframe[SEQ_TSS] + window_size - tss_dataframe.loc[tss_dataframe[SEQ_START] < 0, SEQ_START] = 0 - return tss_dataframe - -if __name__ == '__main__': - main() diff --git a/srrTomat0/PWMToMEME.py b/srrTomat0/PWMToMEME.py deleted file mode 100644 index 0551c48..0000000 --- a/srrTomat0/PWMToMEME.py +++ /dev/null @@ -1,34 +0,0 @@ -from srrTomat0.motifs.pwm import read -from srrTomat0.motifs.meme import write - -import argparse -import glob -import os - - -def main(): - ap = argparse.ArgumentParser(description="Parse naked PWM files into a MEME file") - ap.add_argument("-m", "--motif", dest="motif", help="Motif PWM files", metavar="PATH", required=True, nargs="+") - ap.add_argument("-i", "--info", dest="info", help="Motif Info File", metavar="PATH", required=True) - 
ap.add_argument("-o", "--out", dest="out", help="Output FILE", metavar="FILE", required=True) - - args = ap.parse_args() - - files = [] - for mf in args.motif: - files.extend(glob.glob(os.path.expanduser(mf))) - - pwm_to_meme(files, args.info, args.out) - - -def pwm_to_meme(pwm_file_list, tf_info_file, output_file): - - print("Parsing {x} PWM files".format(x=len(pwm_file_list))) - motifs = read(pwm_file_list, tf_info_file, direct_only=True) - - print("Parsed {m} motifs, writing to file {f}".format(m=len(motifs), f=output_file)) - write(output_file, list(motifs)) - - -if __name__ == '__main__': - main() diff --git a/srrTomat0/RNASeqTomat0.py b/srrTomat0/RNASeqTomat0.py deleted file mode 100644 index 0cd61b8..0000000 --- a/srrTomat0/RNASeqTomat0.py +++ /dev/null @@ -1,134 +0,0 @@ -from __future__ import print_function - -import os - -import pandas as pd - -from srrTomat0 import SRR_SUBPATH, FASTQ_SUBPATH, STAR_ALIGNMENT_SUBPATH, HTSEQ_ALIGNMENT_SUBPATH -from srrTomat0.processor.htseq_count import htseq_count_aligned -from srrTomat0.processor.matrix import pileup_raw_counts, normalize_matrix_to_fpkm, normalize_matrix_to_tpm -from srrTomat0.processor.srr import get_srr_files, unpack_srr_files -from srrTomat0.processor.star import star_align_fastqs -from srrTomat0.processor.utils import file_path_abs, test_requirements_exist, ArgParseTestRequirements - -OUTPUT_COUNT_FILE_NAME = "srr_counts.tsv" -OUTPUT_COUNT_METADATA_NAME = "srr_alignment_metadata.tsv" -OUTPUT_FPKM_FILE_NAME = "srr_fpkm.tsv" -OUTPUT_TPM_FILE_NAME = "srr_tpm.tsv" - - -def main(): - ap = ArgParseTestRequirements(description="Turn a list of RNAseq expression SRRs from NCBI GEO into a count matrix") - ap.add_argument("-s", "--srr", dest="srr", help="SRR record IDs", nargs="+", metavar="SRRID", default=None) - ap.add_argument("-f", "--file", dest="file", help="List of SRR records in a TXT file", metavar="FILE", default=None) - ap.add_argument("-g", "--genome", dest="genome", help="STAR reference genome", 
metavar="PATH", required=True) - ap.add_argument("-a", "--annotation", dest="anno", help="GTF/GFF Annotation File", metavar="FILE", required=True) - ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) - ap.add_argument("--gzip", dest="gzip", help="GZIP output file", action='store_const', const=True, default=False) - ap.add_argument("--cpu", dest="cpu", help="NUM of cores to use", metavar="NUM", type=int, default=4) - ap.add_argument("--star_jobs", dest="sjob", help="NUM of STAR workers to use", metavar="NUM", type=int, default=4) - - args, star_args = ap.parse_known_args() - test_requirements_exist() - - srr_ids = list() - - if args.srr is None and args.file is None: - print("One of --srr or --file must be set") - exit(1) - elif args.srr is not None and args.file is not None: - print("Only one of --srr or --file may be set (not both)") - exit(1) - elif args.srr is not None: - # SRR IDs are provided at command line - srr_ids = args.srr - elif args.file is not None: - # SRR IDs are in a .txt file; read them into a list - srr_ids = pd.read_csv(args.file, sep="\t", index_col=None, header=None).iloc[:, 0].tolist() - else: - raise ValueError("There is something wrong with this switch") - - srr_tomat0(srr_ids, args.out, args.genome, args.anno, gzip_output=args.gzip, cores=args.cpu, star_jobs=args.sjob, - star_args=star_args) - - -def srr_tomat0(srr_ids, output_path, star_reference_genome, annotation_file, gzip_output=False, cores=4, star_jobs=2, - star_args=None): - star_args = [] if star_args is None else star_args - - output_path = file_path_abs(output_path) - os.makedirs(output_path, exist_ok=True) - - # Download all the SRR files - print("Downloading SRR files") - os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True) - srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores) - - # Unpack all the SRR files into FASTQ files - print("Unpacking SRR files") - 
os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True) - fastq_file_names = unpack_srr_files(srr_ids, srr_file_names, os.path.join(output_path, FASTQ_SUBPATH), - num_workers=cores) - - # Run all the FASTQ files through STAR to align - print("Aligning FASTQ files") - os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True) - thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs)) - sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome, - os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), - num_workers=star_jobs, threads_per_worker=thread_count, star_options=star_args) - - # Run all the SAM files through HTSeq.count to count - print("Counting SAM alignments") - os.makedirs(os.path.join(output_path, HTSEQ_ALIGNMENT_SUBPATH), exist_ok=True) - count_file_names = htseq_count_aligned(srr_ids, sam_file_names, annotation_file, - os.path.join(output_path, HTSEQ_ALIGNMENT_SUBPATH), num_workers=cores) - - # Convert the count files into a matrix and save it to a TSV - print("Assembling result matrix") - count_matrix, count_metadata = pileup_raw_counts(srr_ids, count_file_names) - count_matrix_file_name = os.path.join(output_path, OUTPUT_COUNT_FILE_NAME) - - # Save the raw counts file - if gzip_output: - count_matrix.to_csv(count_matrix_file_name + ".gz", compression='gzip', sep="\t") - else: - count_matrix.to_csv(count_matrix_file_name, sep="\t") - - # Save the count metadata file - count_metadata.to_csv(os.path.join(output_path, OUTPUT_COUNT_METADATA_NAME), sep="\t") - - # Normalize to FPKM - print("Normalizing result matrix to FPKM") - normalized_count_matrix_fpkm = normalize_matrix_to_fpkm(count_matrix, annotation_file) - fpkm_file_name = os.path.join(output_path, OUTPUT_FPKM_FILE_NAME) - - # Save the normalized counts file - if gzip_output: - normalized_count_matrix_fpkm.to_csv(fpkm_file_name + ".gz", compression='gzip', sep="\t") - else: - normalized_count_matrix_fpkm.to_csv(fpkm_file_name, 
sep="\t") - - # Normalize to TPM - print("Normalizing result matrix to TPM") - normalized_count_matrix_tpm = normalize_matrix_to_tpm(count_matrix, annotation_file) - tpmx_file_name = os.path.join(output_path, OUTPUT_TPM_FILE_NAME) - - # Save the normalized counts file - if gzip_output: - normalized_count_matrix_tpm.to_csv(tpmx_file_name + ".gz", compression='gzip', sep="\t") - else: - normalized_count_matrix_tpm.to_csv(tpmx_file_name, sep="\t") - - print("Count file {sh} generated from {srlen} SRA files".format(sh=count_matrix.shape, srlen=len(srr_ids))) - failed_counts = list(map(lambda x: x is None, count_file_names)) - - if any(failed_counts): - print("{n} Sequence Records could not be counted:".format(n=sum(failed_counts)), end="") - print("\n\t".join([sid for sid, fail in zip(srr_ids, failed_counts) if fail])) - - return count_matrix - - -if __name__ == '__main__': - main() diff --git a/srrTomat0/__init__.py b/srrTomat0/__init__.py deleted file mode 100644 index cf24741..0000000 --- a/srrTomat0/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -__version__ = '0.1.0' - -# Paths to the executable files -# Defaults assume that executables are in the shell path - -STAR_EXECUTABLE_PATH = os.path.expanduser("STAR") -PREFETCH_EXECUTABLE_PATH = os.path.expanduser("prefetch") -FASTQDUMP_EXECUTABLE_PATH = os.path.expanduser("fastq-dump") -CHROMA_EXECUTABLE_PATH = os.path.expanduser("ChromA") -SAMTOOLS_EXECUTABLE_PATH = os.path.expanduser("samtools") -FIMO_EXECUTABLE_PATH = os.path.expanduser("fimo") -HOMER_EXECUTABLE_PATH = os.path.expanduser("homer2") -BEDTOOLS_EXECUTABLE_PATH = os.path.expanduser("bedtools") - -HTSEQ_MODULE_NAME = "HTSeq.scripts.count" -CHROMA_MODULE_NAME = "ChromA" - -SRR_SUBPATH = "SRR" -FASTQ_SUBPATH = "FASTQ" -STAR_ALIGNMENT_SUBPATH = "STAR" -HTSEQ_ALIGNMENT_SUBPATH = "HTSEQ" -BAM_SUBPATH = "BAM" -FIMO_SUBPATH = "FIMO" diff --git a/srrTomat0/motifs/__init__.py b/srrTomat0/motifs/__init__.py deleted file mode 100644 index 57996db..0000000 --- 
a/srrTomat0/motifs/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from srrTomat0.motifs._motif import (Motif, motifs_to_dataframe, chunk_motifs, - INFO_COL, MOTIF_COL, ENTROPY_COL, LEN_COL, OCC_COL, MOTIF_NAME_COL, SCAN_SCORE_COL, - SCORE_PER_BASE) -from srrTomat0.motifs.motif_scan import MotifScan diff --git a/srrTomat0/motifs/_motif.py b/srrTomat0/motifs/_motif.py deleted file mode 100644 index f71ba43..0000000 --- a/srrTomat0/motifs/_motif.py +++ /dev/null @@ -1,384 +0,0 @@ -import numpy as np -import pandas as pd -import warnings -import os -import tempfile -import math -import itertools -import pathos -from collections import Counter - -from srrTomat0.processor.bedtools import extract_bed_sequence, intersect_bed, load_bed_to_bedtools - -INFO_COL = "Information Content" -ENTROPY_COL = "Shannon Entropy" -OCC_COL = "Occurrence" -LEN_COL = "Length" -MOTIF_COL = "Motif_ID" -MOTIF_NAME_COL = "Motif_Name" - -SCAN_SCORE_COL = "Tomat0_Score" -SCORE_PER_BASE = "Per Base Array" - - -class Motif: - motif_id = None - motif_name = None - motif_url = None - - _motif_probs = None - _motif_counts = None - _motif_prob_array = None - _motif_alphabet = None - _motif_background = None - _motif_species = None - _motif_accession = None - _alphabet_map = None - _consensus_seq = None - _info_matrix = None - _homer_odds = None - - @property - def alphabet(self): - return self._motif_alphabet - - @alphabet.setter - def alphabet(self, new_alphabet): - if new_alphabet is not None: - self._motif_alphabet = new_alphabet - self._alphabet_map = {ch.lower(): i for i, ch in enumerate(self._motif_alphabet)} - - @property - def accession(self): - return self._motif_accession - - @accession.setter - def accession(self, new_accession): - if new_accession is not None: - self._motif_accession = new_accession - - @property - def id(self): - return self.motif_id - - @id.setter - def id(self, new_id): - if new_id is not None: - self.motif_id = new_id - - @property - def name(self): - return self.motif_name 
- - @name.setter - def name(self, new_name): - if new_name is not None: - self.motif_name = new_name - - @property - def alphabet_len(self): - return len(self._motif_alphabet) - - @property - def background(self): - if self._motif_background is None: - self._motif_background = np.array([[1 / self.alphabet_len] * self.alphabet_len]) - return self._motif_background - - @property - def probability_matrix(self): - if self._motif_prob_array is None and len(self._motif_probs) == 0: - return None - if self._motif_prob_array is None or self._motif_prob_array.shape[0] < len(self._motif_probs): - self._motif_prob_array = np.array(self._motif_probs) - return self._motif_prob_array - - @probability_matrix.setter - def probability_matrix(self, matrix): - self._motif_prob_array = matrix - - @property - def count_matrix(self): - return np.array(self._motif_counts) if self._motif_counts is not None else None - - @property - def shannon_entropy(self): - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=RuntimeWarning) - - # Calculate -1 * p log p and set to 0 where p is already 0 - entropy = np.multiply(self.probability_matrix, np.log2(self.probability_matrix)) - entropy[~np.isfinite(entropy)] = 0 - entropy *= -1 - - return np.sum(entropy) - - @property - def information_content(self): - if self.probability_matrix is None: - return 0 - - return np.sum(self.ic_matrix) - - @property - def homer_odds(self): - return self.threshold_ln_odds if self._homer_odds is None else self._homer_odds - - @homer_odds.setter - def homer_odds(self, val): - self._homer_odds = val - - @property - def ic_matrix(self): - if self.probability_matrix is None: - return None - - if self._info_matrix is None or self._info_matrix.shape != self.probability_matrix.shape: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=RuntimeWarning) - - # Calculate p log (p/background) - self._info_matrix = np.divide(self.probability_matrix, self.background.reshape(1, 
-1)) - self._info_matrix = np.multiply(self.probability_matrix, np.log2(self._info_matrix)) - self._info_matrix[~np.isfinite(self._info_matrix)] = 0 - - return self._info_matrix - - @property - def expected_occurrence_rate(self): - return int(2 ** self.information_content) - - @property - def consensus(self): - if self._consensus_seq is None: - self._consensus_seq = "".join(np.apply_along_axis(lambda x: self.alphabet[x.argmax()], axis=1, - arr=self.probability_matrix)) - return self._consensus_seq - - @property - def max_ln_odds(self): - max_ln_odd = np.log(np.amax(self.probability_matrix, axis=1) / 0.25) - return np.sum(max_ln_odd) - - @property - def threshold_ln_odds(self): - second_prob = np.sort(self.probability_matrix, axis=1)[:, 2] - return self.max_ln_odds - max((np.sum(np.log(second_prob[second_prob > 0.25] / 0.25)), 0.1 * self.max_ln_odds)) - - @property - def species(self): - return self._motif_species - - @species.setter - def species(self, new_species): - is_list = isinstance(new_species, (list, tuple)) - - if is_list and self._motif_species is None: - self._motif_species = new_species - elif is_list: - self._motif_species.extend(new_species) - elif self._motif_species is None: - self._motif_species = [new_species] - else: - self._motif_species.append(new_species) - - def __len__(self): - return self.probability_matrix.shape[0] if self.probability_matrix is not None else 0 - - def __str__(self): - return "{mid} {mname}: Width {el} IC {ic:.2f} bits".format(mid=self.motif_id, - mname=self.motif_name, - el=len(self), - ic=self.information_content) - - def __init__(self, motif_id=None, motif_name=None, motif_alphabet=None, motif_background=None): - self.id = motif_id - self.name = motif_name - self.alphabet = motif_alphabet - self._motif_background = motif_background - self._motif_probs = [] - - def add_prob_line(self, line): - self._motif_probs.append(line) - - def add_count_line(self, line): - if self._motif_counts is not None: - 
self._motif_counts.append(line) - else: - self._motif_counts = [line] - - def score_match(self, match, disallow_homopolymer=True, homopolymer_one_off_len=6, score_zero_as_zero=1): - - if len(match) != len(self): - msg = "Sequence length {l} not compatible with motif length {m}".format(l=len(match), m=len(self)) - raise ValueError(msg) - - # Score anything that's a homopolymer to 0 if the flag is set - if disallow_homopolymer and sum([m == match[0] for m in match]) == len(match): - return 0 - - # Score anything that's one base from a homopolymer to 0 if the flag is set - if disallow_homopolymer and (len(match) > homopolymer_one_off_len and - sum([min((c, 2)) for c in Counter(match).values()]) < 4): - return 0 - - # Score anything with excessive nucleotides that have a p ~ 0.0 as 0 - if score_zero_as_zero is not None and sum(p < 0.001 for p in self._prob_match(match)) > score_zero_as_zero: - return 0 - - mse_ic = np.sum(np.square(np.subtract(self._info_match(self.consensus), self._info_match(match)))) - return max((self.information_content - mse_ic, 0)) - - def truncate(self, threshold=0.35): - threshold = np.max(self.probability_matrix, axis=1) > threshold - keepers = (threshold.cumsum() > 0) & (threshold[::-1].cumsum()[::-1] > 0) - self.probability_matrix = self.probability_matrix[keepers, :] - self._motif_probs = list(itertools.compress(self._motif_probs, keepers)) - - def _prob_match(self, match): - return [self.probability_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)] - - def _info_match(self, match): - return [self.ic_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)] - - def species_contains(self, match_str): - if self.species is not None: - match_str = match_str.lower() - return any(match_str in s.lower() for s in self.species) - else: - return False - - -class __MotifScanner: - - def __init__(self, motif_file=None, motifs=None, num_workers=4): - - if (motif_file is None and motifs is None) or (motif_file is 
not None and motifs is not None): - raise ValueError("One of meme_file or motifs must be passed") - - self.motif_file = motif_file - self.motifs = motifs - self.num_workers = num_workers - - def scan(self, genome_fasta_file, atac_bed_file=None, promoter_bed=None, min_ic=None, threshold=None): - """ - """ - - # Preprocess motifs into a list of temp chunk files - motif_files = self._preprocess(min_ic=min_ic) - # Unpack list to a dict for convenience - self.motifs = {mot.motif_id: mot for mot in self.motifs} - - try: - if atac_bed_file is None and promoter_bed is None: - motif_data = self._scan_extract(motif_files, genome_fasta_file, threshold=threshold) - return self._postprocess(motif_data) - elif atac_bed_file is not None and promoter_bed is None: - bed_file = load_bed_to_bedtools(atac_bed_file) - elif atac_bed_file is None and promoter_bed is not None: - bed_file = load_bed_to_bedtools(promoter_bed) - else: - bed_file = intersect_bed(load_bed_to_bedtools(atac_bed_file), load_bed_to_bedtools(promoter_bed)) - - extracted_fasta_file = extract_bed_sequence(bed_file, genome_fasta_file) - - try: - motif_data = self._scan_extract(motif_files, extracted_fasta_file, threshold=threshold) - return self._postprocess(motif_data) - finally: - try: - os.remove(extracted_fasta_file) - except FileNotFoundError: - pass - - finally: - for file in motif_files: - try: - os.remove(file) - except FileNotFoundError: - pass - - def _scan_extract(self, motif_files, extracted_fasta_file, threshold=None): - # If the number of workers is 1, run fimo directly - if self.num_workers == 1: - assert len(motif_files) == 1 - return self._get_motifs(extracted_fasta_file, motif_files[0], threshold=threshold) - - # Otherwise parallelize with a process pool (pathos because dill will do local functions) - else: - # Convenience local function - def _get_chunk_motifs(chunk_file): - return self._get_motifs(extracted_fasta_file, chunk_file, threshold=threshold) - - with 
pathos.multiprocessing.Pool(self.num_workers) as pool: - motif_data = [data for data in pool.imap(_get_chunk_motifs, motif_files)] - motif_data = pd.concat(motif_data) - - return motif_data - - def _preprocess(self, min_ic=None): - raise NotImplementedError - - def _postprocess(self, motif_peaks): - raise NotImplementedError - - def _get_motifs(self, fasta_file, motif_file, threshold=None): - raise NotImplementedError - - def _parse_output(self, output_handle): - raise NotImplementedError - - -def motifs_to_dataframe(motifs): - entropy = list(map(lambda x: x.shannon_entropy, motifs)) - occurrence = list(map(lambda x: x.expected_occurrence_rate, motifs)) - info = list(map(lambda x: x.information_content, motifs)) - ids = list(map(lambda x: x.motif_id, motifs)) - names = list(map(lambda x: x.motif_name, motifs)) - - df = pd.DataFrame( - [ids, names, info, entropy, occurrence, list(map(lambda x: len(x), motifs))], - columns=list(map(lambda x: x.motif_name, motifs)), - index=[MOTIF_COL, MOTIF_NAME_COL, INFO_COL, ENTROPY_COL, OCC_COL, LEN_COL]).T - - return df - - -def chunk_motifs(file_type, motifs, num_workers=4, min_ic=None): - """ - Break a motif file up into chunks - :param file_type: The meme or homer namespaces with a .read() and .write() function - :type file_type: srrTomat0.motifs parser - :param motifs: Motif object list; pass either meme_file or motifs - :type motifs: list(Motif), None - :param num_workers: number of chunks to make - :type num_workers: int - :param min_ic: set an information content minimum on motifs to include if this is not None - :type min_ic: float - :return: List of chunked motif files - :rtype: list - """ - - temp_dir = tempfile.gettempdir() - - if min_ic is not None: - motifs = list(itertools.compress(motifs, [m.information_content >= min_ic for m in motifs])) - - if num_workers == 1: - file_name = os.path.join(temp_dir, "chunk1.mchunk") - file_type.write(file_name, motifs) - return [file_name] - - num_workers = len(motifs) if 
num_workers > len(motifs) else num_workers - chunk_index = np.repeat(np.arange(num_workers).reshape(1, -1), np.ceil(len(motifs) / num_workers), axis=0).flatten() - chunk_index = chunk_index[0:len(motifs)] - - files = [] - - for i in range(num_workers): - file_name = os.path.join(temp_dir, "chunk" + str(i) + ".mchunk") - file_type.write(file_name, [m for m, b in zip(motifs, (chunk_index == i)) if b]) - files.append(file_name) - - return files diff --git a/srrTomat0/motifs/fimo.py b/srrTomat0/motifs/fimo.py deleted file mode 100644 index 7620265..0000000 --- a/srrTomat0/motifs/fimo.py +++ /dev/null @@ -1,68 +0,0 @@ -import io -import subprocess -import pandas as pd -import numpy as np -import pandas.errors as pde - -from srrTomat0 import FIMO_EXECUTABLE_PATH -from srrTomat0.motifs import meme, chunk_motifs, SCAN_SCORE_COL, SCORE_PER_BASE -from srrTomat0.motifs._motif import __MotifScanner - -FIMO_DATA_SUFFIX = ".fimo.tsv" - -FIMO_MOTIF = 'motif_id' -FIMO_MOTIF_COMMON = 'motif_alt_id' -FIMO_CHROMOSOME = 'sequence_name' -FIMO_STRAND = 'strand' -FIMO_START = 'start' -FIMO_STOP = 'stop' -FIMO_SCORE = 'p-value' -FIMO_SEQUENCE = 'matched_sequence' - -FIMO_COMMAND = [FIMO_EXECUTABLE_PATH, "--text", "--parse-genomic-coord"] - - -class FIMOScanner(__MotifScanner): - - def _preprocess(self, min_ic=None): - if self.motif_file is not None: - self.motifs = meme.read(self.motif_file) - - return chunk_motifs(meme, self.motifs, num_workers=self.num_workers, min_ic=min_ic) - - def _postprocess(self, motif_peaks): - motif_peaks = motif_peaks.drop_duplicates(subset=[FIMO_MOTIF, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME]) - return motif_peaks - - def _get_motifs(self, fasta_file, motif_file, threshold=None): - if threshold is None: - fimo_command = FIMO_COMMAND + [motif_file, fasta_file] - else: - fimo_command = FIMO_COMMAND + ["--thresh", str(threshold)] + [motif_file, fasta_file] - - proc = subprocess.run(fimo_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - - if 
int(proc.returncode) != 0: - print("fimo motif scan failed for {meme}, {fa} ({cmd})".format(meme=motif_file, - fa=fasta_file, - cmd=" ".join(fimo_command))) - - return self._parse_output(io.StringIO(proc.stdout.decode("utf-8"))) - - def _parse_output(self, output_handle): - try: - motifs = pd.read_csv(output_handle, sep="\t", index_col=None) - motifs.dropna(subset=[FIMO_START, FIMO_STOP], inplace=True, how='any') - motifs[FIMO_START], motifs[FIMO_STOP] = motifs[FIMO_START].astype(int), motifs[FIMO_STOP].astype(int) - - if "#pattern name" in motifs.columns: - raise RuntimeError("FIMO version not supported; update to 5.0.5") - - motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in - zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])] - motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in - zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])] - - return motifs - except pde.EmptyDataError: - return None diff --git a/srrTomat0/motifs/homer.py b/srrTomat0/motifs/homer.py deleted file mode 100644 index 156d100..0000000 --- a/srrTomat0/motifs/homer.py +++ /dev/null @@ -1,70 +0,0 @@ -import subprocess -import io -import pandas as pd -import numpy as np - -from srrTomat0.motifs import chunk_motifs, homer_motif, SCAN_SCORE_COL, SCORE_PER_BASE -from srrTomat0.motifs._motif import __MotifScanner -from srrTomat0 import HOMER_EXECUTABLE_PATH - -HOMER_DATA_SUFFIX = ".homer.tsv" - -HOMER_SEQ_ID = 'seqid' -HOMER_OFFSET = 'offset' -HOMER_MATCH = 'match' -HOMER_MOTIF = 'motif_id' -HOMER_STRAND = 'strand' -HOMER_SCORE = 'score' -HOMER_CHROMOSOME = 'sequence_name' -HOMER_START = 'start' -HOMER_STOP = 'stop' - -HOMER2_FIND_COLS = [HOMER_SEQ_ID, HOMER_OFFSET, HOMER_MATCH, HOMER_MOTIF, HOMER_STRAND, HOMER_SCORE] -HOMER2_EXPAND_STR_COLS = [HOMER_CHROMOSOME, HOMER_START, HOMER_STOP] - - -class HOMERScanner(__MotifScanner): - - def _preprocess(self, min_ic=None): - if self.motif_file is not None: - self.motifs = homer_motif.read(self.motif_file) - - return 
chunk_motifs(homer_motif, self.motifs, num_workers=self.num_workers, min_ic=min_ic) - - def _postprocess(self, motif_peaks): - motif_peaks = motif_peaks.drop_duplicates(subset=[HOMER_MOTIF, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME]) - return motif_peaks - - def _get_motifs(self, fasta_file, motif_file, threshold=None): - homer_command = [HOMER_EXECUTABLE_PATH, "find", "-i", fasta_file, "-m", motif_file, "-offset", str(0)] - proc = subprocess.run(homer_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - - if int(proc.returncode) != 0: - print("HOMER motif scan failed for {meme}, {fa} (cmd)".format(meme=motif_file, - fa=fasta_file, - cmd=" ".join(homer_command))) - - return self._parse_output(io.StringIO(proc.stdout.decode("utf-8"))) - - def _parse_output(self, output_handle): - motifs = pd.read_csv(output_handle, sep="\t", index_col=None, names=HOMER2_FIND_COLS) - - loc_data = motifs[HOMER_SEQ_ID].str.split(r"[\:\-]", expand=True) - loc_data.columns = HOMER2_EXPAND_STR_COLS if loc_data.shape[1] == 3 else HOMER2_EXPAND_STR_COLS + ["UNKNOWN"] - loc_data[HOMER_START] = loc_data[HOMER_START].astype(int) + motifs[HOMER_OFFSET] - - match_width = motifs[HOMER_MATCH].str.len() - - loc_data.loc[motifs[HOMER_STRAND] == "-", HOMER_START] -= match_width.loc[motifs[HOMER_STRAND] == "-"] - 1 - - loc_data[HOMER_STOP] = loc_data[HOMER_START] + motifs[HOMER_MATCH].str.len() - - motifs[[HOMER_CHROMOSOME, HOMER_START, HOMER_STOP]] = loc_data[[HOMER_CHROMOSOME, HOMER_START, HOMER_STOP]] - motifs.drop([HOMER_SEQ_ID, HOMER_OFFSET], inplace=True, axis=1) - - motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in - zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])] - motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in - zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])] - - return motifs diff --git a/srrTomat0/motifs/homer_motif.py b/srrTomat0/motifs/homer_motif.py deleted file mode 100644 index 621e8e1..0000000 --- 
a/srrTomat0/motifs/homer_motif.py +++ /dev/null @@ -1,71 +0,0 @@ -from srrTomat0.motifs import Motif - -HOMER_MOTIF_RECORD = """\ ->{consensus}\t{mname}\t{odds_score:.6f} -{pmatrix}""" - -HOMER_ALPHABET = "ACGT" - - -def read(file_descript): - - # Parse if it's a string - if isinstance(file_descript, str): - with open(file_descript) as motif_fh: - return [m for m in __parse_motif_gen(motif_fh)] - - # Parse if it's a file handle - else: - return [m for m in __parse_motif_gen(file_descript)] - - -def write(file_descript, motifs, alphabet=None, background=None, mode="w"): - - motifs = [motifs] if not isinstance(motifs, list) else motifs - - # Write if it's a string - if isinstance(file_descript, str): - with open(file_descript, mode=mode) as motif_fh: - for motif in motifs: - __write_motif(motif_fh, motif) - - # Write if it's a file handle - else: - for motif in motifs: - __write_motif(file_descript, motif) - - -def __parse_motif_gen(handle): - - active_motif = None - - for line in handle: - line = line.strip() - - if len(line) > 0 and line.lower().startswith(">"): - if active_motif is not None: - yield active_motif - line = line.split() - active_motif = Motif(line[1], None, list(HOMER_ALPHABET)) - active_motif.homer_odds = line[2] - elif len(line) > 0: - probs = line.split() - if active_motif is not None and len(probs) == len(HOMER_ALPHABET): - active_motif.add_prob_line(list(map(lambda x: float(x), probs))) - - if active_motif is not None: - yield active_motif - - -def __write_motif(motif_fh, motif): - - if motif.alphabet is not None and "".join(motif.alphabet).upper() != HOMER_ALPHABET: - raise ValueError("HOMER requires ACGT alphabet only") - - p_mat = "\n".join(["\t".join(map(lambda x: " {:.4f}".format(x), r)) for r in motif.probability_matrix]) - - record = HOMER_MOTIF_RECORD.format(consensus=motif.consensus, - mname=motif.motif_id, - odds_score=motif.homer_odds, - pmatrix=p_mat) - print(record, file=motif_fh) diff --git a/srrTomat0/motifs/meme.py 
b/srrTomat0/motifs/meme.py deleted file mode 100644 index 7e3a758..0000000 --- a/srrTomat0/motifs/meme.py +++ /dev/null @@ -1,179 +0,0 @@ -from srrTomat0.motifs import Motif - -import numpy as np - -MEME4_HEADER = """\ -MEME version 4 - -ALPHABET= {alphabet} - -strands: {strands} - -Background letter frequencies: -{bkgd} -""" - -MEME4_RECORD = """\ -MOTIF {motif_id} {motif_name} - -letter-probability matrix: alength= {alen} w= {w} -{pmatrix} - -URL {url} -""" - - -def read(file_descript): - - # Parse if it's a string - if isinstance(file_descript, str): - with open(file_descript) as motif_fh: - return [m for m in __parse_meme_file(motif_fh)] - - # Parse if it's a file handle - else: - return [m for m in __parse_meme_file(file_descript)] - - -def write(file_descript, motifs, alphabet=None, background=None, mode="w"): - - motifs = [motifs] if not isinstance(motifs, list) else motifs - alphabet = alphabet if alphabet is not None else motifs[0].alphabet - background = np.array([[1 / len(alphabet)] * len(alphabet)]) if background is None else background - - def _write_file(fh): - __write_header(fh, alphabet, background) - for motif in motifs: - __write_motif(fh, motif) - - # Write if it's a string - if isinstance(file_descript, str): - with open(file_descript, mode=mode) as motif_fh: - _write_file(motif_fh) - - # Write if it's a file handle - else: - _write_file(file_descript) - - -def __parse_meme_file(meme_fh): - alph = __parse_alphabet(meme_fh) - - neg_strand, pos_strand = __parse_strand(meme_fh, strict=False) - - bkgd = __parse_background(meme_fh, strict=False) - bkgd = np.array([[1 / len(alph)] * len(alph)]) if bkgd is None else np.array([[bkgd[a] for a in alph]]) - - return [m for m in __parse_motif_gen(meme_fh, alph, bkgd)] - - -def __parse_alphabet(handle, strict=True): - - for line in handle: - if line.strip().lower().startswith("alphabet"): - handle.seek(0) - return list(line.strip().split()[-1]) - - if strict: - raise MEMEDatabaseError("Unable to locate 
`ALPHABET =` line") - - -def __parse_strand(handle, strict=True): - - for line in handle: - if line.strip().lower().startswith("strands"): - handle.seek(0) - strands = "".join(line.strip().split()[-2:]) - return "-" in strands, "+" in strands - - if strict: - raise MEMEDatabaseError("Unable to locate `ALPHABET =` line") - else: - handle.seek(0) - return True, True - - -def __parse_background(handle, strict=True): - - find_flag = False - - for line in handle: - line = line.strip() - if line.lower().startswith("background"): - find_flag = True - continue - if len(line) > 0 and find_flag: - probs = line.split() - - if len(probs) % 2 != 0: - raise MEMEDatabaseError("Background probabilities do not parse correctly") - - handle.seek(0) - return {a: float(b) for a, b in zip(probs[::2], probs[1::2])} - - if strict: - raise MEMEDatabaseError("Unable to locate background probabilities") - else: - return None - - -def __parse_motif_gen(handle, alphabet, background): - - active_motif = None - - for line in handle: - line = line.strip() - - if active_motif is None and line.lower().startswith("motif"): - line = line.split() - active_motif = Motif(line[1], line[2] if len(line) > 2 else None, alphabet, background) - continue - elif active_motif is not None and line.lower().startswith("motif"): - yield active_motif - line = line.split() - active_motif = Motif(line[1], line[2] if len(line) > 2 else None, alphabet, background) - continue - - if line.lower().startswith("letter-probability") or len(line) == 0: - continue - - if line.lower().startswith("url") and active_motif is not None: - active_motif.motif_url = line.split()[-1].strip() - continue - - probs = line.split() - if active_motif is not None and len(probs) == len(alphabet): - active_motif.add_prob_line(list(map(lambda x: float(x), probs))) - - if active_motif is not None: - yield active_motif - - -def __write_header(handle, alphabet, bkgd, pos_strand=True, neg_strand=True): - - strands = ["-"] if neg_strand and not 
pos_strand else ['+'] if pos_strand and not neg_strand else ["-", "+"] - bkgd = ["{} {:.5f}".format(a, b) for a, b in zip(alphabet, bkgd.flatten().tolist())] - - meme4_header = MEME4_HEADER.format(alphabet="".join(alphabet), - strands=" ".join(strands), - bkgd=" ".join(bkgd)) - - print(meme4_header, file=handle) - - -def __write_motif(handle, motif): - - p_mat = "\n".join(["\t".join(map(lambda x: " {:.6f}".format(x), r)) for r in motif.probability_matrix]) - - meme4_record = MEME4_RECORD.format(motif_id=motif.motif_id if motif.motif_id is not None else "", - motif_name=motif.motif_name if motif.motif_name is not None else "", - alen=motif.alphabet_len, - w=len(motif), - pmatrix=p_mat, - url=motif.motif_url if motif.motif_url is not None else "") - - print(meme4_record, file=handle) - - -class MEMEDatabaseError(ValueError): - pass diff --git a/srrTomat0/motifs/motif_scan.py b/srrTomat0/motifs/motif_scan.py deleted file mode 100644 index 68bbbbb..0000000 --- a/srrTomat0/motifs/motif_scan.py +++ /dev/null @@ -1,49 +0,0 @@ -from srrTomat0.motifs.fimo import FIMO_MOTIF, FIMO_SCORE, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME, FIMOScanner -from srrTomat0.motifs.homer import HOMER_MOTIF, HOMER_SCORE, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME, HOMERScanner -from srrTomat0.motifs import meme -from srrTomat0.motifs import homer_motif - - -class MotifScan(object): - """ - This class handles keeping track of the info needed for each type of motif scanner - """ - - _motif_file_type = 'fimo' - - name_col = FIMO_MOTIF - score_col = FIMO_SCORE - chromosome_col = FIMO_CHROMOSOME - start_col = FIMO_START - stop_col = FIMO_STOP - scanner = FIMOScanner - - @classmethod - def set_type_fimo(cls): - cls.name_col = FIMO_MOTIF - cls.score_col = FIMO_SCORE - cls.chromosome_col = FIMO_CHROMOSOME - cls.start_col = FIMO_START - cls.stop_col = FIMO_STOP - - cls._motif_file_type = 'fimo' - cls.scanner = FIMOScanner - - @classmethod - def set_type_homer(cls): - cls.name_col = HOMER_MOTIF - 
cls.score_col = HOMER_SCORE - cls.chromosome_col = HOMER_CHROMOSOME - cls.start_col = HOMER_START - cls.stop_col = HOMER_STOP - - cls._motif_file_type = 'homer' - cls.scanner = HOMERScanner - - @classmethod - def load_motif_file(cls, motif_file_name): - if motif_file_name.lower().endswith(".meme"): - return meme.read(motif_file_name) - else: - return homer_motif.read(motif_file_name) - diff --git a/srrTomat0/motifs/pwm.py b/srrTomat0/motifs/pwm.py deleted file mode 100644 index 1bff466..0000000 --- a/srrTomat0/motifs/pwm.py +++ /dev/null @@ -1,46 +0,0 @@ -from srrTomat0.motifs import Motif, MOTIF_COL - -import pandas as pd -import pandas.errors as pde -import os - -TF_NAME_COL = "TF_Name" -TF_STATUS_COL = "TF_Status" - - -def read(pwm_file_list, info_file, background=None, direct_only=False): - - info_df = pd.read_csv(info_file, sep="\t") - motifs = [] - - for pwm_file in pwm_file_list: - pwm_id = os.path.splitext(os.path.basename(pwm_file))[0] - - if direct_only: - direct = info_df.loc[info_df[MOTIF_COL] == pwm_id, TF_STATUS_COL].str.contains("D") - if not direct.any(): - continue - else: - pwm_names = info_df.loc[(info_df[MOTIF_COL] == pwm_id) & (info_df[TF_STATUS_COL] == "D"), TF_NAME_COL] - else: - pwm_names = info_df.loc[info_df[MOTIF_COL] == pwm_id, TF_NAME_COL] - - pwm_name = "/".join(pwm_names) - - try: - pwm = pd.read_csv(pwm_file, sep="\t", index_col=0) - except pde.ParserError: - print("Parser error on file {f}".format(f=pwm_name)) - continue - - pwm_alphabet = pwm.columns.tolist() - - motif = Motif(pwm_id, pwm_name, pwm_alphabet, motif_background=background) - motif.probability_matrix = pwm.values - - if min(pwm.values.shape) == 0: - continue - - motifs.append(motif) - - return motifs diff --git a/srrTomat0/motifs/transfac.py b/srrTomat0/motifs/transfac.py deleted file mode 100644 index 9afbc86..0000000 --- a/srrTomat0/motifs/transfac.py +++ /dev/null @@ -1,90 +0,0 @@ -from srrTomat0.motifs import Motif - -import numpy as np - -TRANSFAC_CODES = {"AC": 
"Accession", - "ID": "ID", - "NA": "Name", - "DT": "Date", - "CO": "Copyright", - "DE": "Description", - "TY": "Type", - "OS": "", - "OL": "", - "BF": "Species", - "P0": "Alphabet", - "SR": "", - "BA": "", - "CC": "", - "PR": "Profile"} - - -def read(file_descript): - - # Parse if it's a string - if isinstance(file_descript, str): - with open(file_descript) as motif_fh: - return [m for m in _parse_transfac_file(motif_fh)] - - # Parse if it's a file handle - else: - return [m for m in _parse_transfac_file(file_descript)] - - -def _parse_transfac_file(transfac_fh): - return [m for m in __parse_motif_gen(transfac_fh)] - - -def __parse_motif_gen(handle): - - active_motif = Motif() - - for line in handle: - line = line.strip() - - if len(line) < 2: - continue - - line_id, line = line[:2].upper(), line[2:].strip() - - # Spacer - if line_id == "XX": - continue - - # New record - elif line_id == "//" and len(active_motif) > 0: - yield active_motif - active_motif = Motif() - - elif line_id == "//": - active_motif = Motif() - - # Accession - elif line_id == "AC": - active_motif.accession = line - - # ID - elif line_id == "ID": - active_motif.motif_id = line - - # Name - elif line_id == "NA": - active_motif.motif_name = line - - # Alphabet - elif line_id == "P0": - active_motif.alphabet = line.split() - - elif line_id == "BF": - active_motif.species = line - - # Prob - elif line_id.isdigit(): - counts = list(map(float, line.split()[:-1])) - active_motif.add_count_line(counts) - total_seqs = sum(counts) - active_motif.add_prob_line(list(map(lambda x: x / total_seqs, counts))) - - if len(active_motif) > 0: - yield active_motif - diff --git a/srrTomat0/processor/__init__.py b/srrTomat0/processor/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/srrTomat0/processor/bedtools.py b/srrTomat0/processor/bedtools.py deleted file mode 100644 index 55668c1..0000000 --- a/srrTomat0/processor/bedtools.py +++ /dev/null @@ -1,123 +0,0 @@ -from srrTomat0.processor.gtf 
import GTF_CHROMOSOME, GTF_GENENAME, SEQ_START, SEQ_STOP, GTF_STRAND -import pandas as pd -import pybedtools -import os -import subprocess -import tempfile - -BEDTOOLS_EXTRACT_SUFFIX = ".extract.fasta" - -# Column names -BED_CHROMOSOME = 'chrom' - -SEQ_COUNTS = 'count' -SEQ_BIN = 'bin' -SEQ_SCORE = 'p-value' - - -def get_peaks_in_features(feature_dataframe, peak_dataframe, feature_group_column=GTF_CHROMOSOME, - peak_group_column=BED_CHROMOSOME): - genes = feature_dataframe.copy() - - # Add counts (and set to 0) - genes[SEQ_COUNTS] = 0 - - # Group genes and peaks by chromosome - - genes = {val: df for val, df in genes.groupby(feature_group_column)} - peaks = {val: df for val, df in peak_dataframe.groupby(peak_group_column)} - - chromosomes = set(genes.keys()).intersection(set(peaks.keys())) - - # Count overlaps on a per-chromosome basis - gene_counts = [] - for chromosome in chromosomes: - - # Function to return the number of overlaps with peaks in `chip_peaks` - # Iterates over genes from GTF data frame (using apply) - def _find_overlap(x): - start_bool = x[SEQ_START] <= peaks[chromosome][SEQ_STOP] - stop_bool = x[SEQ_STOP] >= peaks[chromosome][SEQ_START] - if sum(start_bool & stop_bool) == 0: - return 0 - selected_peaks = peaks[chromosome].loc[start_bool & stop_bool, :].copy() - selected_peaks.loc[selected_peaks[SEQ_START] < x[SEQ_START], SEQ_START] = x[SEQ_START] - selected_peaks.loc[selected_peaks[SEQ_STOP] > x[SEQ_STOP], SEQ_STOP] = x[SEQ_STOP] - return sum(selected_peaks[SEQ_STOP] - selected_peaks[SEQ_START]) - - # Add a chromosome column and then process into an integer peak count - genes[chromosome][feature_group_column] = chromosome - genes[chromosome][SEQ_COUNTS] = genes[chromosome].apply(_find_overlap, axis=1) - gene_counts.append(genes[chromosome]) - - # Combine all - gene_counts = pd.concat(gene_counts).reset_index().loc[:, [GTF_GENENAME, SEQ_COUNTS]] - - return gene_counts - - -def load_bed_to_dataframe(bed_file_path, **kwargs): - """ - :param 
bed_file_path: str - :return: pd.DataFrame - """ - - return pd.read_csv(bed_file_path, sep="\t", index_col=None, **kwargs) - - -def extract_bed_sequence(bed_file, genome_fasta, output_path=None): - output_path = tempfile.gettempdir() if output_path is None else output_path - output_file = os.path.join(output_path, os.path.split(genome_fasta)[1] + BEDTOOLS_EXTRACT_SUFFIX) - - if not isinstance(bed_file, pybedtools.BedTool): - bed_file = pybedtools.BedTool(bed_file) - - try: - bed_file.sequence(fi=genome_fasta, fo=output_file) - except pybedtools.helpers.BEDToolsError as pbe: - print(pbe.msg) - - return output_file - - -def load_bed_to_bedtools(bed): - if bed is None: - return None - elif isinstance(bed, pd.DataFrame): - return pybedtools.BedTool.from_dataframe(bed) - else: - return pybedtools.BedTool(bed) - - -def intersect_bed(*beds): - - if len(beds) == 1: - return beds[0] - - beds = [b.sort() for b in beds] - return beds[0].intersect(beds[1:], sorted=True) - - -def _merge_peaks_with_bedtools(merge_data, merge_columns, merge_function_names, max_distance=0): - """ - :param merge_data: pd.DataFrame - :param merge_columns: list(int) - :param merge_function_names: list(str) - :return: - """ - - assert len(merge_columns) == len(merge_function_names) - assert len(merge_columns) + 3 == merge_data.shape[1] - - # Load the data into a BedTool object - pbt_data = pybedtools.BedTool.from_dataframe(merge_data).sort() - - if len(merge_columns) > 0: - # Merge the overlapping peaks - pbt_data = pbt_data.merge(d=max_distance, c=merge_columns, o=merge_function_names).to_dataframe(max_distance) - else: - pbt_data = pbt_data.merge(d=max_distance).to_dataframe() - - pbt_data.columns = merge_data.columns - return pbt_data - diff --git a/srrTomat0/processor/chroma.py b/srrTomat0/processor/chroma.py deleted file mode 100644 index f526985..0000000 --- a/srrTomat0/processor/chroma.py +++ /dev/null @@ -1,13 +0,0 @@ -import asyncio -import os -import sys - -from srrTomat0.processor.utils 
import file_path_abs -from srrTomat0 import CHROMA_EXECUTABLE_PATH - -CHROMA_OUTPUT_FILE_EXTENSION = ".out.bed" -CHROMA_EXECUTABLE_CALL = [CHROMA_EXECUTABLE_PATH] - - -def chroma_process(): - pass diff --git a/srrTomat0/processor/gtf.py b/srrTomat0/processor/gtf.py deleted file mode 100644 index 81c3657..0000000 --- a/srrTomat0/processor/gtf.py +++ /dev/null @@ -1,144 +0,0 @@ -import warnings -import pybedtools -import pandas as pd - -GENE_ID_REGEX = 'gene_id\s\"([A-Za-z0-9\.\-\(\)]+)\"\;' - -# Column names -GTF_ATTRIBUTES = 'attributes' -GTF_CHROMOSOME = 'seqname' -GTF_GENENAME = 'gene_name' -GTF_STRAND = 'strand' -SEQ_START = 'start' -SEQ_STOP = 'end' -SEQ_TSS = 'TSS' - - -def load_gtf_to_dataframe(gtf_path): - """ - Loads genes from a GTF into a dataframe and returns them - :param gtf_path: str - :return annotations: pd.DataFrame [N x 5] - 'gene_name': str - 'strand': str - 'start': int - 'end': int - 'seqname': str - """ - - # Load annotations into a dataframe with pybedtools - annotations = pybedtools.BedTool(gtf_path).to_dataframe() - - # Drop anything with NaNs which were probably comment lines - annotations = annotations.loc[~pd.isnull(annotations[SEQ_START]) & ~pd.isnull(annotations[SEQ_STOP]), :] - - # Regex extract the gene_id from the annotations column - annotations[GTF_GENENAME] = annotations[GTF_ATTRIBUTES].str.extract(GENE_ID_REGEX, expand=False) - - # Define genes as going from the minimum start for any subfeature to the maximum end for any subfeature - annotations = _fix_genes(annotations) - - # Fix chromosome names to always be strings - annotations[GTF_CHROMOSOME] = annotations[GTF_CHROMOSOME].astype(str) - - return _add_TSS(annotations) - - -def open_window(annotation_dataframe, window_size, use_tss=False, check_against_fasta=None): - """ - This needs to adjust the start and stop in the annotation dataframe with window sizes - :param annotation_dataframe: pd.DataFrame - :param window_size: int - :param use_tss: bool - :param 
check_against_fasta: - :return window_annotate: pd.DataFrame - """ - window_annotate = annotation_dataframe.copy() - - try: - if len(window_size) == 1: - w_up, w_down = window_size[0], window_size[0] - elif len(window_size) == 2: - w_up, w_down = window_size[0], window_size[1] - else: - raise ValueError("window_size must have 1 or 2 values only") - except TypeError: - w_up, w_down = window_size, window_size - - if use_tss: - window_annotate.loc[window_annotate[GTF_STRAND] == "+", SEQ_START] = window_annotate[SEQ_TSS] - w_up - window_annotate.loc[window_annotate[GTF_STRAND] == "+", SEQ_STOP] = window_annotate[SEQ_TSS] + w_down - window_annotate.loc[window_annotate[GTF_STRAND] == "-", SEQ_START] = window_annotate[SEQ_TSS] - w_down - window_annotate.loc[window_annotate[GTF_STRAND] == "-", SEQ_STOP] = window_annotate[SEQ_TSS] + w_up - else: - window_annotate.loc[window_annotate[GTF_STRAND] == "+", SEQ_START] = window_annotate[SEQ_START] - w_up - window_annotate.loc[window_annotate[GTF_STRAND] == "+", SEQ_STOP] = window_annotate[SEQ_STOP] + w_down - window_annotate.loc[window_annotate[GTF_STRAND] == "-", SEQ_START] = window_annotate[SEQ_START] - w_down - window_annotate.loc[window_annotate[GTF_STRAND] == "-", SEQ_STOP] = window_annotate[SEQ_STOP] + w_up - - window_annotate.loc[window_annotate[SEQ_START] < 0, SEQ_START] = 0 - - if check_against_fasta is not None: - fasta_len = {} - with open(check_against_fasta, mode="r") as fasta_fh: - current_record = None - for line in fasta_fh: - if line.startswith(">"): - current_record = line[1:].split()[0] - fasta_len[current_record] = 0 - else: - fasta_len[current_record] += len(line.strip()) - - _gtf_chromosomes = set(window_annotate[GTF_CHROMOSOME].unique()) - _fasta_chromsomes = set(fasta_len.keys()) - _gtf_fasta_match = _gtf_chromosomes.intersection(_fasta_chromsomes) - - if len(_gtf_fasta_match) != len(_gtf_chromosomes): - _msg = "GTF File Chromosomes {g} do not match FASTA File Chromosomes {f}\n" - _msg += "The following 
chromosomes will not map correctly: {ft}" - _msg = _msg.format(g=_gtf_chromosomes, - f=_fasta_chromsomes, - ft=_gtf_chromosomes.symmetric_difference(_fasta_chromsomes)) - warnings.warn(_msg) - - if len(_gtf_fasta_match) == 0: - raise ValueError("Unable to map FASTA and GTF chromosomes together") - - for chromosome in _gtf_fasta_match: - _chrlen = fasta_len[chromosome] - _idx = window_annotate[GTF_CHROMOSOME] == chromosome - window_annotate.loc[_idx & (window_annotate[SEQ_STOP] > _chrlen), SEQ_STOP] = _chrlen - window_annotate.loc[_idx & (window_annotate[SEQ_START] > _chrlen), SEQ_START] = _chrlen - - return window_annotate - - -def _fix_genes(gene_dataframe): - """ - Find minimum start and maximum stop - :param gene_dataframe: pd.DataFrame - :return: - """ - - # Make sure that the strandedness doesn't reverse start/stop - assert (gene_dataframe[SEQ_START] <= gene_dataframe[SEQ_STOP]).all() - - def _most_common(x): - return x.value_counts().index[0] - - # Define the functions for aggregating gene records - aggregate_functions = {SEQ_START: min, SEQ_STOP: max, GTF_CHROMOSOME: _most_common, GTF_STRAND: _most_common} - - return gene_dataframe.groupby("gene_name").aggregate(aggregate_functions).reset_index() - - -def _add_TSS(gene_dataframe): - """ - Add a TSS column in place - :param gene_dataframe: pd.DataFrame - :return: - """ - gene_dataframe[SEQ_TSS] = gene_dataframe[SEQ_START].copy() - rev_strand = gene_dataframe[GTF_STRAND] == "-" - gene_dataframe.loc[rev_strand, SEQ_TSS] = gene_dataframe.loc[rev_strand, SEQ_STOP].copy() - return gene_dataframe diff --git a/srrTomat0/processor/htseq_count.py b/srrTomat0/processor/htseq_count.py deleted file mode 100644 index ad0c9d2..0000000 --- a/srrTomat0/processor/htseq_count.py +++ /dev/null @@ -1,88 +0,0 @@ -import asyncio -import os -import sys - -from srrTomat0 import HTSEQ_MODULE_NAME -from srrTomat0.processor.utils import file_path_abs - -HTSEQ_COUNT_FILE_EXTENSION = ".count.out" -HTSEQ_EXECUTABLE_CALL = 
[str(sys.executable), "-m", HTSEQ_MODULE_NAME] - - -def htseq_count_aligned(srr_ids, sam_file_names, annotation_file, output_path, num_workers=5): - """ - - :param srr_ids: list(str) - NCBI SRR ID string - :param sam_file_names: list(list(str)) - A list of complete SAM file names that were aligned from the FASTQ - :param annotation_file: str - A path to the GTF/GFF annotation file - :param output_path: str - The path to put the output alignment files - :param num_workers: int - Number of separate simultaneous jobs to run - :return: - """ - - sem = asyncio.Semaphore(num_workers) - - # Build HTseq.count tasks - tasks = [_htseq_count(sid, samfn, annotation_file, output_path, sem) - for sid, samfn in zip(srr_ids, sam_file_names)] - - # Run and return STAR tasks - return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) - - -# TODO: test this -async def _htseq_count(srr_id, sam_file_name, annotation_file_name, output_path, semaphore): - """ - :param srr_id: str - NCBI SRR ID string - :param sam_file_name: str - Path to the aligned SAM file - :param annotation_file_name: str - A path to the GTF/GFF annotations - :param output_path: str - A path to the output - :param semaphore: asyncio.Semaphore - Semaphore for resource utilization - :return output_file: str - The path to the GeneCount file generated by HTCount - """ - - async with semaphore: - - if sam_file_name is None: - return None - - try: - os.makedirs(output_path) - except FileExistsError: - pass - - output_file = os.path.join(file_path_abs(output_path), srr_id + HTSEQ_COUNT_FILE_EXTENSION) - - if os.path.exists(output_file): - print("{id} countfile exists ({path})".format(id=srr_id, path=output_path)) - return output_file - - # Call out to an external interpreter - htseq_call = [*HTSEQ_EXECUTABLE_CALL, "--stranded=no", sam_file_name, annotation_file_name] - - print(" ".join(htseq_call)) - process = await asyncio.create_subprocess_exec(*htseq_call, - stdout=asyncio.subprocess.PIPE, - 
stderr=asyncio.subprocess.PIPE) - - (output_data, output_err) = await process.communicate() - - if int(process.returncode) != 0: - print("HTSeq.count failed for {id} ({file})".format(id=srr_id, file=sam_file_name)) - return None - - with open(output_file, mode="w") as out_fh: - print(output_data.decode('utf-8'), file=out_fh) - - return output_file diff --git a/srrTomat0/processor/matrix.py b/srrTomat0/processor/matrix.py deleted file mode 100644 index 2856221..0000000 --- a/srrTomat0/processor/matrix.py +++ /dev/null @@ -1,176 +0,0 @@ -import HTSeq -import pandas as pd - -INDEX_NAME = "gene" -COUNT_COLUMN = "count" - -META_STARTSWITH_FLAG = "__" -META_ALIGNED_COUNTS = "aligned_feature_sum" - -TRANSCRIPT_TYPE_FLAG = "exon" - - -# Turn count files into a count matrix -# TODO: test this -def pileup_raw_counts(srr_ids, count_files): - """ - Convert the HTSeq count files to a dataframe of SRR-derived expression values - - :param srr_ids: list(str) - NCBI SRR ID string - :param count_files: list(str) - A list of HTSeq count files - :return matrix_data: pd.DataFrame [Genes x Samples] - A dataframe of raw, unnormalized count values from all SRR alignments - """ - matrix_data = pd.DataFrame() - meta_data = [] - for srr_id, count_file_name in zip(srr_ids, count_files): - - if count_file_name is None: - continue - - # Load in the count data - count_data = pd.read_csv(count_file_name, sep="\t", index_col=0, header=None) - count_data.index.name = INDEX_NAME - count_data.columns = [COUNT_COLUMN] - - # Find the metadata - count_metadata_indexes = count_data.index[count_data.index.str.startswith(META_STARTSWITH_FLAG)] - - # Process metadata - count_meta_data = count_data.loc[count_metadata_indexes, :].rename(columns={COUNT_COLUMN: srr_id}).transpose() - count_meta_data.columns = count_meta_data.columns.str.strip(META_STARTSWITH_FLAG) - - # Remove metadata from count dataframe - count_data = count_data.drop(count_metadata_indexes) - - # Make sure that the pileup matrix has all the 
genes - if len(count_data.index.difference(matrix_data.index)) > 0: - new_index = matrix_data.index.union(count_data.index) - matrix_data = matrix_data.reindex(new_index) - matrix_data[pd.isna(matrix_data)] = 0 - - # Make sure that the count data is aligned to the matrix data - if not count_data.index.equals(matrix_data.index): - count_data = count_data.reindex(matrix_data.index) - count_data[pd.isna(count_data)] = 0 - - # Stick the count data onto the data frame - count_data = count_data.reindex(matrix_data.index) - matrix_data[srr_id] = count_data[COUNT_COLUMN].astype(int) - - # Add the total counts to the metadata - count_meta_data[META_ALIGNED_COUNTS] = count_data[COUNT_COLUMN].sum() - meta_data.append(count_meta_data) - - # Combine the meta_data into a single dataframe - meta_data = pd.concat(meta_data) - - return matrix_data, meta_data - - -# Turn a raw read count into a normalized RPKM / FPKM per gene -def normalize_matrix_to_fpkm(matrix_data, annotation_file): - """ - Convert a raw count dataframe to a library and gene size normalized dataframe (RPKM / FPKM) - - :param matrix_data: pd.DataFrame [Genes x Samples] - Dataframe of raw counts per gene - :param annotation_file: str - Path to the genome annotation (GTF) file - :return normalized_matrix: pd.DataFrame [Genes x Samples] - Normalized dataframe (FPKM) - """ - - gene_lengths = load_gene_lengths(annotation_file) - - diff = matrix_data.index.difference(gene_lengths.index) - if len(diff) > 0: - print("Dropping genes with unknown lengths: {genes}".format(genes=" ".join(diff.tolist()))) - - normalized_matrix = matrix_data.drop(diff, axis=0) - - # Normalize the libraries by read depth to counts per million reads - normalized_matrix = normalized_matrix.divide(normalized_matrix.sum()) * 1e6 - - # Normalize the libraries by gene length to counts per kilobase per million reads - normalized_matrix = normalized_matrix.divide(gene_lengths['length'], axis=0) - - return normalized_matrix - - -# Turn a raw read count 
into a normalized TPM per gene -def normalize_matrix_to_tpm(matrix_data, annotation_file): - """ - Convert a raw count dataframe to a library and gene size normalized dataframe (TPM) - - :param matrix_data: pd.DataFrame [Genes x Samples] - Dataframe of raw counts per gene - :param annotation_file: str - Path to the genome annotation (GTF) file - :return normalized_matrix: pd.DataFrame [Genes x Samples] - Normalized dataframe (TPM) - """ - - gene_lengths = load_gene_lengths(annotation_file) - - diff = matrix_data.index.difference(gene_lengths.index) - if len(diff) > 0: - print("Dropping genes with unknown lengths: {genes}".format(genes=" ".join(diff.tolist()))) - - # Align data - normalized_matrix = matrix_data.drop(diff, axis=0) - gene_lengths = gene_lengths.reindex(normalized_matrix.index) - - # Normalize the libraries by gene length to counts per kilobase - normalized_matrix = normalized_matrix.divide(gene_lengths['length'], axis=0) - - # Normalize the libraries by scaling to the library size - normalized_matrix = normalized_matrix.divide(normalized_matrix.sum()) * 1e6 - - return normalized_matrix - - -def load_gene_lengths(annotation_file): - """ - Load gene lengths from an annotation file - - :param annotation_file: str - Path to the genome annotation (GTF) file - :return gene_lengths: pd.DataFrame[G x 1] - Dataframe indexed by gene name - - ========== ======= ============================================================== - length int sum of exon length in kilobases - ========== ======= ============================================================== - - """ - - # Load a GFF reader from HTSeq - gff_reader = HTSeq.GFF_Reader(annotation_file) - - # Get exons for each gene - gene_lengths = {} - for gf in gff_reader: - if gf.type == TRANSCRIPT_TYPE_FLAG: - try: - gene_lengths[gf.name].append(_gene_length(gf)) - except KeyError: - gene_lengths[gf.name] = [_gene_length(gf)] - - # Sum exon lengths and pack into a dataframe in kilobases - gene_lengths = 
pd.DataFrame.from_dict({gn: sum(exons) / 10e3 for gn, exons in gene_lengths.items()}, - orient='index', columns=['length']) - - return gene_lengths - -def _gene_length(htseq_genomic_feature): - """ - Get feature length - :param htseq_genomic_feature: HTSeq.GenomeFeature - GenomeFeature from a GFF_Reader iterable - :return: int - Feature length - """ - return abs(htseq_genomic_feature.iv.start - htseq_genomic_feature.iv.end) diff --git a/srrTomat0/processor/prior.py b/srrTomat0/processor/prior.py deleted file mode 100644 index 6cc8898..0000000 --- a/srrTomat0/processor/prior.py +++ /dev/null @@ -1,364 +0,0 @@ -from srrTomat0.processor.gtf import GTF_GENENAME, GTF_CHROMOSOME, SEQ_START, SEQ_STOP -from srrTomat0.motifs.motif_scan import MotifScan -from srrTomat0.motifs import INFO_COL, MOTIF_COL, LEN_COL, SCAN_SCORE_COL, MOTIF_NAME_COL, SCORE_PER_BASE - -import pandas as pd -import pandas.api.types as pat -import numpy as np -import pathos.multiprocessing as multiprocessing -from sklearn.cluster import DBSCAN -from sklearn.covariance import EllipticEnvelope -from scipy.ndimage.filters import uniform_filter1d - -PRIOR_TF = 'regulator' -PRIOR_GENE = 'target' -PRIOR_COUNT = 'count' -PRIOR_SCORE = 'score' -PRIOR_MOTIF_IC = 'motif_ic' -PRIOR_PVAL = 'pvalue' -PRIOR_SEQ = 'sequence' -PRIOR_START = 'start' -PRIOR_STOP = 'stop' -PRIOR_CHR = 'chromosome' - -PRIOR_COLS = [PRIOR_TF, PRIOR_GENE, PRIOR_COUNT, PRIOR_SCORE, PRIOR_MOTIF_IC, PRIOR_START, PRIOR_STOP, PRIOR_CHR] - -PRIOR_FDR = 'qvalue' -PRIOR_SIG = 'significance' - -MINIMUM_MOTIF_IC_BITS = None -MAXIMUM_TANDEM_DISTANCE = 100 - - -class MotifScorer: - min_binding_ic = MINIMUM_MOTIF_IC_BITS - max_dist = MAXIMUM_TANDEM_DISTANCE - - @classmethod - def set_information_criteria(cls, min_binding_ic=None, max_dist=None): - """ - Set parameters for - :param min_binding_ic: - :param max_dist: - :return: - """ - cls.min_binding_ic = cls.min_binding_ic if min_binding_ic is None else min_binding_ic - cls.max_dist = cls.max_dist if 
max_dist is None else max_dist - - @classmethod - def score_tf(cls, tf_motifs): - """ - Score a single TF - :param tf_motifs: Motif binding sites from FIMO/HOMER - :type tf_motifs: pd.DataFrame - :return: Score if the TF should be kept, None otherwise - """ - - assert isinstance(tf_motifs, pd.DataFrame) - - # Drop sites that don't meet threshold - if cls.min_binding_ic is not None: - tf_motifs = tf_motifs.loc[tf_motifs[SCAN_SCORE_COL] >= cls.min_binding_ic, :] - n_sites = tf_motifs.shape[0] - - # If there's no data return None - if n_sites == 0: - return None - - # Sort and check for overlapping motifs - tf_motifs = tf_motifs.sort_values(by=MotifScan.start_col) - overlap = tf_motifs[MotifScan.start_col] < tf_motifs[MotifScan.stop_col].shift() - - # Collapse together any overlapping motifs to the maximum score on a per-base basis - if overlap.any(): - - tf_motifs["GROUP"] = (~overlap).cumsum() - tf_motifs = pd.concat([cls._agg_per_base(group) for _, group in tf_motifs.groupby("GROUP")]) - - n_sites = tf_motifs.shape[0] - - # If there's only one site check it and then return - if n_sites == 1: - return cls._top_hit(tf_motifs) - - # If there's only two sites check it and then return - if n_sites == 2: - consider_tandem = tf_motifs.iloc[0, :][MotifScan.stop_col] - tf_motifs.iloc[1, :][MotifScan.start_col] - if consider_tandem > cls.max_dist: - return cls._top_hit(tf_motifs) - else: - start = tf_motifs.iloc[0, :][MotifScan.start_col] - stop = tf_motifs.iloc[1, :][MotifScan.stop_col] - score = tf_motifs[SCAN_SCORE_COL].sum() - return score, 2, start, stop - - # If there's more than two sites do the complicated tandem checking stuff - else: - # Find things that are in tandems - consider_tandem = (tf_motifs[MotifScan.stop_col] - tf_motifs[MotifScan.start_col].shift(1)) - consider_tandem = consider_tandem <= cls.max_dist - - # Skip the rest if nothing is close enough to matter - if not consider_tandem.any(): - return cls._top_hit(tf_motifs) - - # Ffill the tandem group to 
have the same start - tandem_starts = tf_motifs[MotifScan.start_col].copy() - tandem_starts.loc[consider_tandem] = pd.NA - tandem_starts = tandem_starts.ffill() - - # Backfill the tandem group to have the same stop - tandem_stops = tf_motifs[MotifScan.stop_col].copy() - tandem_stops.loc[consider_tandem.shift(-1, fill_value=False)] = pd.NA - tandem_stops = tandem_stops.bfill() - - # Concat, group by start/stop, and then sum IC scores - tandem_peaks = pd.concat([tandem_starts, tandem_stops, tf_motifs[SCAN_SCORE_COL]], axis=1) - tandem_peaks.columns = [PRIOR_START, PRIOR_STOP, PRIOR_SCORE] - tandem_peaks = tandem_peaks.groupby(by=[PRIOR_START, PRIOR_STOP]).agg('sum').reset_index() - - # Return the highest tandem array group - peak = tandem_peaks.loc[tandem_peaks[PRIOR_SCORE].argmax(), :] - return peak[PRIOR_SCORE], peak.shape[0], peak[PRIOR_START], peak[PRIOR_STOP] - - @classmethod - def preprocess_motifs(cls, gene_motif_data, motif_information): - if cls.min_binding_ic is not None: - motif_information = motif_information.loc[motif_information[INFO_COL] >= cls.min_binding_ic, :] - keeper_motifs = motif_information[MOTIF_COL].unique().tolist() - keeper_idx = (gene_motif_data[MotifScan.name_col].isin(keeper_motifs)) - keeper_idx &= (gene_motif_data[SCAN_SCORE_COL] >= cls.min_binding_ic) - - return gene_motif_data.loc[keeper_idx, :], motif_information - else: - return gene_motif_data, motif_information - - @staticmethod - def _top_hit(tf_motifs): - if tf_motifs.shape[0] == 0: - return None - elif tf_motifs.shape[0] == 1: - top_hit = tf_motifs.iloc[0, :] - else: - top_hit = tf_motifs.iloc[tf_motifs[SCAN_SCORE_COL].values.argmax(), :] - - start = MotifScorer._first_value(top_hit[MotifScan.start_col]) - stop = MotifScorer._first_value(top_hit[MotifScan.stop_col]) - score = MotifScorer._first_value(top_hit[SCAN_SCORE_COL]) - return score, 1, start, stop - - @staticmethod - def _first_value(series): - try: - return series.iloc[0] - except AttributeError: - return series - - 
@classmethod - def _agg_per_base(cls, overlap_df): - """ - Aggregate an overlapping set of motif peaks by summing the maximum per-base IC for each base - :param overlap_df: - :return: - """ - if len(overlap_df) == 1: - return overlap_df[[MotifScan.start_col, MotifScan.stop_col, SCAN_SCORE_COL, MOTIF_NAME_COL]] - - overlap_df.reset_index(inplace=True) - - new_df = [(a, b) for i in overlap_df.index for a, b in zip(range(overlap_df.loc[i, MotifScan.start_col], - overlap_df.loc[i, MotifScan.stop_col]), - overlap_df.loc[i, SCORE_PER_BASE])] - - return pd.DataFrame({MotifScan.start_col: [overlap_df[MotifScan.start_col].min()], - MotifScan.stop_col: [overlap_df[MotifScan.stop_col].max()], - SCAN_SCORE_COL: pd.DataFrame(new_df, columns=["B", "S"]).groupby("B").agg('max').sum(), - MOTIF_NAME_COL: [overlap_df[MOTIF_NAME_COL].unique()[0]]}) - - -def build_prior_from_atac_motifs(genes, motif_peaks, motif_information, num_workers=1, seed=42): - """ - Construct a prior [G x K] interaction matrix - :param genes: pd.DataFrame [G x n] - :param motif_peaks: pd.DataFrame - Motif search data loaded from FIMO or HOMER - :param motif_information: pd.DataFrame [n x 5] - Motif characteristics loaded from a MEME file - :return prior_data, prior_matrix: pd.DataFrame [G*K x 6], pd.DataFrame [G x K] - A long-form edge table data frame and a wide-form interaction matrix data frame - """ - - motif_ids = motif_information[MOTIF_COL].unique() - print("Building prior from {g} genes and {k} Motifs".format(g=genes.shape[0], k=len(motif_ids))) - - motif_peaks, motif_information = MotifScorer.preprocess_motifs(motif_peaks, motif_information) - print("Preliminary search identified {n} binding sites".format(n=motif_peaks.shape[0])) - - # Trim down the motif dataframe and put it into a dict by chromosome - motif_peaks = motif_peaks.reindex([MotifScan.name_col, MotifScan.chromosome_col, MotifScan.start_col, - MotifScan.stop_col, SCAN_SCORE_COL, SCORE_PER_BASE], axis=1) - - motif_id_to_name = 
motif_information.reindex([MOTIF_COL, MOTIF_NAME_COL], axis=1) - invalid_names = (pd.isnull(motif_id_to_name[MOTIF_NAME_COL]) | - (motif_id_to_name[MOTIF_NAME_COL] == "") | - (motif_id_to_name is None)) - - motif_id_to_name.loc[invalid_names, MOTIF_NAME_COL] = motif_id_to_name.loc[invalid_names, MOTIF_COL] - motif_peaks = motif_peaks.join(motif_id_to_name.set_index(MOTIF_COL, verify_integrity=True), on=MotifScan.name_col) - motif_names = motif_information[MOTIF_NAME_COL].unique() - - motif_peaks = {chromosome: df for chromosome, df in motif_peaks.groupby(MotifScan.chromosome_col)} - - def _prior_mapper(data): - i, gene_data, motifs = data - return _build_prior_for_gene(gene_data, motifs, motif_information, i) - - if num_workers == 1: - prior_data = list(map(_prior_mapper, _gene_gen(genes, motif_peaks))) - else: - with multiprocessing.Pool(num_workers, maxtasksperchild=1000) as pool: - prior_data = pool.map(_prior_mapper, _gene_gen(genes, motif_peaks), chunksize=20) - - # Combine priors for all genes - prior_data = pd.concat(prior_data).reset_index(drop=True) - prior_data[PRIOR_START] = prior_data[PRIOR_START].astype(int) - prior_data[PRIOR_STOP] = prior_data[PRIOR_STOP].astype(int) - - np.random.seed(seed) - - # Pivot to a matrix, extend to all TFs, and fill with 0s - raw_matrix = prior_data.pivot(index=PRIOR_GENE, columns=PRIOR_TF, values=PRIOR_SCORE) - raw_matrix = raw_matrix.reindex(motif_names, axis=1).reindex(genes[GTF_GENENAME], axis=0).fillna(0) - raw_matrix.index.name = PRIOR_GENE - - prior_matrix = raw_matrix.copy() - # Threshold using DBSCAN outlier detection - for reg in prior_matrix.columns: - prior_matrix.loc[~_find_outliers_elliptic_envelope(prior_matrix[reg]), reg] = 0. 
- - # Keep the peaks that we want - thresholded_data = prior_matrix.reset_index().melt(id_vars=PRIOR_GENE, var_name=PRIOR_TF, value_name='T') - thresholded_data = prior_data.merge(thresholded_data, on=[PRIOR_GENE, PRIOR_TF]) - thresholded_data = thresholded_data.loc[thresholded_data['T'] != 0, :] - thresholded_data.drop('T', axis=1, inplace=True) - - return thresholded_data, prior_matrix, raw_matrix - - -def _gene_gen(genes, motif_peaks): - for i, (idx, gene_data) in enumerate(genes.iterrows()): - try: - gene_chr, gene_start, gene_stop = gene_data[GTF_CHROMOSOME], gene_data[SEQ_START], gene_data[SEQ_STOP] - - motif_data = motif_peaks[gene_data[GTF_CHROMOSOME]] - motif_mask = motif_data[MotifScan.stop_col] >= gene_start - motif_mask &= motif_data[MotifScan.start_col] <= gene_stop - motif_data = motif_data.loc[motif_mask, :].copy() - yield i, gene_data, motif_data - except KeyError: - continue - - -def _find_outliers_dbscan(tf_data, t_1=0.01, t_2=0.05): - scores = tf_data.values.reshape(-1, 1) - counts = tf_data.shape[0] - - labels = DBSCAN(min_samples=np.log2(counts), eps=scores.max() / 100).fit_predict(scores) - - # Keep any outliers (outliers near 0 should be discarded) - keep_edge = pd.Series((labels == -1) & (tf_data.values > np.mean(scores)), index=tf_data.index) - - # Iterate through clusters in reverse order until at least t_1 and no more than t_2 edges are included - for lab in np.unique(labels)[::-1]: - current_ratio = keep_edge.sum() / keep_edge.size - new_labels = labels == lab - if current_ratio > t_1: - break - elif current_ratio + (new_labels.sum() / new_labels.size) > t_2: - break - else: - keep_edge |= new_labels - - return keep_edge - - -def _find_outliers_elliptic_envelope(tf_data, outlier=2.5, skip_threshold=0.002): - - scores = tf_data.values - keep_genes = pd.Series(False, index=tf_data.index) - - if np.var(scores) == 0.: - return keep_genes - - _nz_idx = scores > 0 - - if skip_threshold is not None and np.sum(_nz_idx) < (len(keep_genes) * 
skip_threshold): - return keep_genes | _nz_idx - - # Calculate Mahalanobis distance - _nzs = scores[_nz_idx].reshape(-1, 1) - - if np.var(_nzs) == 0.: - return keep_genes - - # Correct for just the dumbest bug in scipy - if (np.mean(_nzs) == np.median(_nzs)) & (np.sum(_nzs == np.median(_nzs)) > (0.5 * _nzs.size)): - keep_genes[_nz_idx] = (_nzs >= np.median(_nzs)).flatten() - return keep_genes - - m_dist = EllipticEnvelope(support_fraction=1).fit(_nzs).score_samples(_nzs) - scaled_m_dist = (m_dist - np.mean(m_dist)) / np.std(m_dist) - keep_genes[_nz_idx] = scaled_m_dist < (-1 * outlier) - - return keep_genes - - -def _build_prior_for_gene(gene_info, motif_data, motif_information, num_iteration): - """ - Takes motifs identified by scan near a single gene and turns them into TF-gene scores - - :param gene_info: Gene information from annotations - :type gene_info: pd.DataFrame - :param motif_data: Motif locations near the gene - :type motif_data: pd.DataFrame - :param motif_information: Motif information - :type motif_information: pd.DataFrame - :param num_iteration: Number of genes which have been processed - :type num_iteration: int - :return prior_edges: pd.DataFrame [N x 5] - 'regulator': tf name - 'target': gene name - 'count': number of motifs found - 'score': information content-based score of binding site - 'motif_ic': information content score of motif - 'start': binding site start - 'stop': binding site stop - 'chromosome' binding site chromosome - """ - - gene_name = gene_info[GTF_GENENAME] - gene_chr, gene_start, gene_stop = gene_info[GTF_CHROMOSOME], gene_info[SEQ_START], gene_info[SEQ_STOP] - - if num_iteration % 100 == 0: - print("Processing gene {i} [{gn}]".format(i=num_iteration, gn=gene_name)) - - if min(motif_data.shape) == 0: - return pd.DataFrame(columns=PRIOR_COLS) - - prior_edges = [] - for tf, tf_peaks in motif_data.groupby(MOTIF_NAME_COL): - tf_info = motif_information.loc[motif_information[MOTIF_NAME_COL] == tf, :] - res = 
MotifScorer.score_tf(tf_peaks) - - # Unpack results if there is a hit - if res is None: - continue - else: - score, tf_counts, start, stop = res - - info = tf_info[INFO_COL].mean() if tf_info.shape[0] > 0 else np.nan - - # Add this edge to the table - prior_edges.append((tf, gene_name, tf_counts, score, info, start, stop, gene_chr)) - - return pd.DataFrame(prior_edges, columns=PRIOR_COLS) diff --git a/srrTomat0/processor/samtools.py b/srrTomat0/processor/samtools.py deleted file mode 100644 index 59ef483..0000000 --- a/srrTomat0/processor/samtools.py +++ /dev/null @@ -1,83 +0,0 @@ -import asyncio -import os - -from srrTomat0 import SAMTOOLS_EXECUTABLE_PATH - -BAM_EXTENSION = ".bam" - - -def sam_sort(srr_ids, sam_files, target_path, min_quality=None, num_workers=5): - """ - Sort (and filter) SAM files into BAM files - - :param srr_ids: list(str) - List of SRA IDs to acquire from NCBI - :param sam_files: list(str) - List of SAM file paths - :param target_path: str - Target path for the SRA files - :param min_quality: int - If set, filter reads for MINQ - :param num_workers: int - Number of concurrent jobs to run - :return: - """ - - sem = asyncio.Semaphore(num_workers) - - tasks = [_process_sam(sid, sfn, target_path, sem, min_quality=min_quality) for sid, sfn in zip(srr_ids, sam_files)] - - return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) - - -async def _process_sam(srr_id, sam_file, target_path, semaphore, min_quality=None): - """ - Sort (and filter) SAM file into BAM file - :param srr_id: str - NCBI SRR ID string - :param sam_file: - The complete path to the SAM file - :param target_path: str - The path to put the BAM file - :param semaphore: asyncio.Semaphore - Semaphore for resource utilization - :param min_quality: int - Minimum alignment quality score to include (None disables filter) - :return bam_file_name: str - Path to the created BAM file name - """ - async with semaphore: - - if sam_file is None: - return None - - bam_file_name = 
os.path.join(target_path, srr_id + BAM_EXTENSION) - - samtools_sort_call = [SAMTOOLS_EXECUTABLE_PATH] - sort_cmd = ["sort", "-o", bam_file_name] - - # If min_quality is set, pipe in a view with a -q flag set - if min_quality is not None: - samtools_sort_call.extend(["view", "-q", str(min_quality), sam_file, "|", - SAMTOOLS_EXECUTABLE_PATH, *sort_cmd, "-"]) - else: - samtools_sort_call.extend([*sort_cmd, sam_file]) - - # Create a sorted BAM file - try: - print(" ".join(samtools_sort_call)) - process = await asyncio.create_subprocess_exec(*samtools_sort_call) - code = await process.wait() - except: - code = 1 - raise - finally: - if int(code) != 0: - print("samtools sort failed for {id} ({file})".format(id=srr_id, file=bam_file_name)) - try: - os.remove(bam_file_name) - except FileNotFoundError: - pass - return None - - return bam_file_name diff --git a/srrTomat0/processor/srr.py b/srrTomat0/processor/srr.py deleted file mode 100644 index 7e5f4bc..0000000 --- a/srrTomat0/processor/srr.py +++ /dev/null @@ -1,173 +0,0 @@ -import asyncio -import os - -from srrTomat0.processor.utils import file_path_abs -from srrTomat0 import FASTQDUMP_EXECUTABLE_PATH, PREFETCH_EXECUTABLE_PATH - -PREFETCH_OPTIONS = ["--max-size", "1000000000"] - -SRA_EXTENSION = ".sra" -POSSIBLE_FASTQ_EXTENSIONS = [".fastq.gz", "_1.fastq.gz", "_2.fastq.gz", "_3.fastq.gz", "_4.fastq.gz"] - - -# TODO: test this -def get_srr_files(srr_list, target_path, num_workers=5, prefetch_options=PREFETCH_OPTIONS): - """ - Take a list of SRR ID strings, download them async with num_workers concurrent jobs, and return a list of the - paths to the SRR files that have been downloaded. 
- :param srr_list: list(str) - List of SRA IDs to acquire from NCBI - :param target_path: str - Target path for the SRA files - :param num_workers: int - Number of concurrent jobs to run - :param prefetch_options: list(str) - Any additional command line arguments to pass to prefetch - :return: - """ - sem = asyncio.Semaphore(num_workers) - - srr_file_names = list(map(lambda x: os.path.join(file_path_abs(target_path), x + SRA_EXTENSION), srr_list)) - tasks = [_get_srr(sid, sfn, sem, prefetch_options=prefetch_options) for sid, sfn in zip(srr_list, srr_file_names)] - - try: - return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) - except RuntimeError: - return asyncio.new_event_loop().run_until_complete(asyncio.gather(*tasks)) - - -# TODO: test this -async def _get_srr(srr_id, srr_file_name, semaphore, prefetch_options=PREFETCH_OPTIONS): - """ - Take a SRR ID string and get the SRR file for it from NCBI. - - :param srr_id: str - NCBI SRR ID string - :param srr_file_name: str - The path to the SRR file (the FULL path) - :param semaphore: asyncio.Semaphore - Semaphore for resource utilization - :param prefetch_options: list(str) - Any additional command line arguments to pass to prefetch - :return srr_file_name: str - The SRR file name (including path) - """ - async with semaphore: - # If the file is already downloaded, don't do anything - if os.path.exists(srr_file_name): - print("{id} exists in file {file}".format(id=srr_id, file=srr_file_name)) - return srr_file_name - - prefetch_call = [PREFETCH_EXECUTABLE_PATH, srr_id, "-o", srr_file_name, *prefetch_options] - print(" ".join(prefetch_call)) - process = await asyncio.create_subprocess_exec(*prefetch_call) - code = await process.wait() - - if int(code) != 0: - print("NCBI Prefetch failed for {id} ({file})".format(id=srr_id, file=srr_file_name)) - return None - - return srr_file_name - - -# TODO: test this -def unpack_srr_files(srr_ids, srr_file_names, target_path, num_workers=5): - """ - Take an 
SRR file and unpack it into a set of FASTQ files - - :param srr_ids: list(str) - NCBI SRR ID string - :param srr_file_names: list(str) - The complete path to the SRR file - :param target_path: str - The path to put the FASTQ file(s) - :param num_workers: int - Number of concurrent jobs to run - :return fastq_file_names: list - A list of complete FASTQ file names that were unpacked from the SRR file (including path) - """ - - sem = asyncio.Semaphore(num_workers) - - tasks = [_unpack_srr(sid, sfn, target_path, sem) for sid, sfn in zip(srr_ids, srr_file_names)] - return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) - - -# TODO: test this -async def _unpack_srr(srr_id, srr_file_name, target_path, semaphore): - """ - - :param srr_id: str - NCBI SRR ID string - :param srr_file_name: str - The complete path to the SRR file - :param target_path: str - The path to put the FASTQ file(s) - :param semaphore: asyncio.Semaphore - Semaphore for resource utilization - :return: - """ - async with semaphore: - - if srr_file_name is None: - return [None] - - # Check and see if this has already been done - output_file_names = list(map(lambda x: os.path.join(file_path_abs(target_path), srr_id + x), - POSSIBLE_FASTQ_EXTENSIONS)) - files_created = check_list_of_files_exist(output_file_names) - - # If the file is already unpacked, don't do anything - if len(files_created) > 0: - print("{id} exists in path {path} ({files})".format(id=srr_id, path=target_path, - files=" ".join(files_created))) - return files_created - - # Build a fastq-dump call and execute it - fastq_dump_call = [FASTQDUMP_EXECUTABLE_PATH, "--gzip", "--split-files", "--outdir", target_path, - srr_file_name] - - print(" ".join(fastq_dump_call)) - - # Run fastq-dump and get the files that were created from it - return_code = 0 - try: - process = await asyncio.create_subprocess_exec(*fastq_dump_call) - return_code = await process.wait() - file_output = check_list_of_files_exist(output_file_names) - 
except: - return_code = 1 - file_output = [None] - raise - finally: - # If the fastq-dump failed, clean up the files associated with it and then move on - if int(return_code) != 0: - print("NCBI fastq-dump failed for {id} ({file})".format(id=srr_id, file=srr_file_name)) - files_created = check_list_of_files_exist(output_file_names) - for f in files_created: - try: - os.remove(f) - except FileNotFoundError: - pass - file_output = [None] - - # Find out which read files were created by looking into the output folder - return file_output - - -def check_list_of_files_exist(file_list): - """ - Check a list of file names and return subset of the list that exists (or an empty list if none exist) - :param file_list: list(str) - List of file names - :return existing_file_list: list(str) - List of files that exist - """ - - existing_file_list = [] - - for file_name in file_list: - if os.path.exists(file_path_abs(file_name)): - existing_file_list.append(file_name) - - return existing_file_list diff --git a/srrTomat0/processor/star.py b/srrTomat0/processor/star.py deleted file mode 100644 index 95a5aad..0000000 --- a/srrTomat0/processor/star.py +++ /dev/null @@ -1,218 +0,0 @@ -import asyncio -import os -import shutil -import subprocess - -import numpy as np - -from srrTomat0.processor.utils import get_file_from_url, file_path_abs, get_genome_file_locs -from srrTomat0 import STAR_EXECUTABLE_PATH - -STAR_COUNT_FILE_NAME = "ReadsPerGene.out.tab" -STAR_ALIGNMENT_FILE_NAME = "Aligned.out.sam" -STAR_COUNT_FILE_METAINDEXES = ["N_unmapped", "N_multimapping", "N_noFeature", "N_ambiguous"] -STAR_COUNT_FILE_HEADER = ["Total", "MinusStrand", "PlusStrand"] -STAR_COUNT_COLUMN = "Total" - -STAR_DEFAULT_MKREF_OPTIONS = [] -STAR_DEFAULT_COUNT_OPTIONS = [] - - -# TODO: test this -def star_align_fastqs(srr_ids, fastq_file_names, reference_genome, output_path, num_workers=4, threads_per_worker=5, - star_options=STAR_DEFAULT_COUNT_OPTIONS): - """ - Take a set of FASTQ files and align them with the 
STAR aligner - - :param srr_ids: list(str) - NCBI SRR ID string - :param fastq_file_names: list(list(str)) - A list of complete FASTQ file names that were unpacked from the SRR file (including path) - :param reference_genome: str - A path to the STAR reference genome that was preassembled - :param output_path: str - The path to put the output alignment files - :param num_workers: int - Number of separate simultaneous jobs to run - :param threads_per_worker: int - Number of threads to assign to each job in STAR (--runThreadN) - :param star_options: list(str) - A list of options to pass to the STAR aligner - :return sam_file_names: list(str) - The SAM alignment files generated by STAR (including path) - """ - - sem = asyncio.Semaphore(num_workers) - - # Build output paths for STAR from SRR ids - output_paths = list(map(lambda x: os.path.join(output_path, x, ''), srr_ids)) - - # Build STAR tasks - tasks = [_star_align(sid, fqfn, reference_genome, sout, sem, - threads_per_worker=threads_per_worker, star_options=star_options) - for sid, fqfn, sout in zip(srr_ids, fastq_file_names, output_paths)] - - # Run and return STAR tasks - return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks)) - - -# TODO: test this -async def _star_align(srr_id, fastq_file_names, reference_genome, output_path, semaphore, - threads_per_worker=5, star_options=STAR_DEFAULT_COUNT_OPTIONS): - """ - Align an individual set of FASTQs from an SRA to the reference genome - :param srr_id: str - NCBI SRR ID string - :param fastq_file_names: list(str) - A list of FASTQ files for the SRR ID - :param reference_genome: str - A path to the STAR reference genome - :param output_path: str - A path to the output - :param semaphore: asyncio.Semaphore - Semaphore for resource utilization - :param threads_per_worker: int - Number of threads to assign to each job in STAR (--runThreadN) - :param star_options: list(str) - A list of options to pass to the STAR aligner - :return output_file: str - The 
path to the SAM file generated by STAR - """ - async with semaphore: - - if fastq_file_names[0] is None: - return None - - try: - os.makedirs(output_path) - except FileExistsError: - pass - - output_file = os.path.join(file_path_abs(output_path), STAR_ALIGNMENT_FILE_NAME) - - if os.path.exists(output_file): - print("{id} SAM alignment file exists ({path})".format(id=srr_id, path=output_path)) - return output_file - - # Build the STAR executable call - star_call = [STAR_EXECUTABLE_PATH, - "--runThreadN", str(threads_per_worker), - "--runMode", "alignReads", - "--readFilesCommand", "zcat", - "--genomeDir", reference_genome, - "--outFileNamePrefix", os.path.join(file_path_abs(output_path), ''), - "--readFilesIn", *fastq_file_names, - "--outFilterType", "BySJout"] - - # Add in any additional options - star_call.extend(star_options) - - print(" ".join(star_call)) - process = await asyncio.create_subprocess_exec(*star_call) - code = await process.wait() - - if int(code) != 0: - print("STAR failed for {id} ({files})".format(id=srr_id, files=" ".join(fastq_file_names))) - return None - - return output_file - - -# TODO: test this -def star_mkref(output_path, genome_file=None, annotation_file=None, default_genome=None, - star_options=STAR_DEFAULT_MKREF_OPTIONS, cores=1, gff_annotations=None, - star_executable=STAR_EXECUTABLE_PATH, move_files=True): - """ - Make a reference genome index for STAR to align reads to - :param output_path: str - Path to output reference index into - :param genome_file: list(str) - Genome sequences (usually FASTA) - :param annotation_file: str - Annotation file (usually GTF or GFF) - :param default_genome: str - A string to identify one of the common genomes - This will cause the genome data to be downloaded from ENSEMBL - :param star_options: list - A list of additional options to pass to STAR - :param cores: int - Number of cores to pass to STAR - :param gff_annotations: bool - Flag for GFF3 (instead of GTF) annotations. 
If None, it will autodetect .gff files. - :param star_executable: str - Path to the STAR executable - :param move_files: bool - Move the genome/annotation files to a `files` path in the STAR reference genome. If false, just copy. - :return output_path: str - Location where the reference genome has been created - """ - - # Get default genome files from the internet if needed - if (genome_file is None or annotation_file is None) and default_genome is None: - raise ValueError("star_mkref() requires (genome_file AND annotation_file) OR default_genome to be passed") - elif default_genome is not None: - ((genome_url, genome_file), (annotation_url, annotation_file)) = get_genome_file_locs(default_genome) - genome_file = [get_file_from_url(genome_url, genome_file)] - annotation_file = get_file_from_url(annotation_url, annotation_file) - - # Create the output path - output_path = file_path_abs(output_path) - try: - os.makedirs(output_path) - except FileExistsError: - pass - - # Uncompress the genome file if it's gzipped - for i, gf in enumerate(genome_file): - if gf.endswith(".gz"): - subprocess.call(["gunzip", gf]) - genome_file[i] = gf[:-3] - - # Uncompress the annotation file if it's gzipped - if annotation_file.endswith(".gz"): - subprocess.call(["gunzip", annotation_file]) - annotation_file = annotation_file[:-3] - - # Build the STAR executable call - star_call = [star_executable, - "--outFileNamePrefix", os.path.join(file_path_abs(output_path), ''), - "--runThreadN", str(cores), - "--runMode", "genomeGenerate", - "--genomeDir", output_path, - "--genomeFastaFiles", *genome_file, - "--sjdbGTFfile", annotation_file] - - # Add any passed-in options - star_call.extend(star_options) - - # Set a flag for STAR if it's a small genome - # Sum file sizes as a proxy for genome size (approximately correct for ASCII files) - star_sa_idx_size = sum(map(lambda x: os.path.getsize(x), genome_file)) - # Calculate genomeSAindexNbases value with the weird equation from the STAR manual - 
star_sa_idx_size = int(np.floor(np.log2(star_sa_idx_size) / 2 - 1)) - if star_sa_idx_size < 14: - star_call.extend(["--genomeSAindexNbases", str(star_sa_idx_size)]) - - # Set a flag for STAR if the annotation file is GFF3 - if (gff_annotations is None and ".gff" in annotation_file) or gff_annotations: - star_call.extend(["--sjdbGTFtagExonParentTranscript", "Parent"]) - - # Execute STAR - print(" ".join(star_call)) - subprocess.call(star_call) - - output_file_path = os.path.join(output_path, "files") - try: - os.mkdir(output_file_path) - except FileExistsError: - pass - - if move_files: - file_func = os.rename - else: - file_func = shutil.copy2 - - [file_func(file, os.path.join(output_file_path, os.path.basename(file))) for file in genome_file] - file_func(annotation_file, os.path.join(output_file_path, os.path.basename(annotation_file))) - - return output_path diff --git a/srrTomat0/processor/utils.py b/srrTomat0/processor/utils.py deleted file mode 100644 index c24323d..0000000 --- a/srrTomat0/processor/utils.py +++ /dev/null @@ -1,142 +0,0 @@ -import argparse -import os -import shutil -import subprocess -import sys -import numpy as np - -if sys.version_info[0] < 3: - print("The srrTomat0 package requires python3") - exit(1) - -import urllib.parse -import urllib.request - -from srrTomat0 import STAR_EXECUTABLE_PATH, PREFETCH_EXECUTABLE_PATH, FASTQDUMP_EXECUTABLE_PATH - -# Tuple of ((fasta_url, fasta_file_name), (gff_url, gff_file_name)) - -_HG38 = (( - "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.fna.gz", - "hg38.fa.gz"), - ( - "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gtf.gz", - "hg38.gtf.gz")) - -_SC64 = ( -("ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz", - "sc64.fa.gz"), 
-("ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.gtf.gz", - "sc64.gtf.gz")) - -_MM10 = (( - "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz", - "mm10.fa.gz"), - ( - "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.gtf.gz", - "mm10.gtf.gz")) - -# Key by genome name -_DEFAULT_GENOMES = {"hg38": _HG38, "sc64": _SC64, "mm10": _MM10} - -# Requirements tests (produce version for each requirement) -_TEST_REQUIREMENTS = {'prefetch': ("", [PREFETCH_EXECUTABLE_PATH, "--version"]), - 'fastq-dump': ("", [FASTQDUMP_EXECUTABLE_PATH, "--version"]), - 'STAR': ("STAR : ", [STAR_EXECUTABLE_PATH, "--version"])} - - -def get_genome_file_locs(genome): - if genome in _DEFAULT_GENOMES.keys(): - return _DEFAULT_GENOMES[genome] - else: - raise ValueError("Genome must be one of {k}".format(k=" ".join(_DEFAULT_GENOMES.keys()))) - - -def get_file_from_url(file_url, file_name_local=None): - """ - Download a file from a url to a local file - :param file_url: - :param file_name_local: - :return: - """ - - if file_name_local is None: - file_name_local = file_path_abs(urllib.parse.urlsplit(file_url).path.split("/")[-1]) - - print("Downloading {url} to {file}".format(url=file_url, file=file_name_local)) - - with urllib.request.urlopen(file_url) as remote_handle, open(file_name_local, mode="wb") as local_handle: - shutil.copyfileobj(remote_handle, local_handle) - - return file_name_local - - -def file_path_abs(file_path): - """ - Convert a file path to a safe absolute path - :param file_path: str - :return: str - """ - return os.path.abspath(os.path.expanduser(file_path)) - - -def test_requirements_exist(test_targets=_TEST_REQUIREMENTS.keys(), test_package=_TEST_REQUIREMENTS, test_htseq=True, - test_chroma=False): - """ - Test that the requirements to run this package exist. 
Print versions of what can be found and raise a ValueError if - any required software is missing. - :param test_targets: list(str) - A list of packages to test - :param test_package: dict - A dict, keyed by package names, of the commands to run to test the package versions - :param test_htseq: bool - Test for the python HTSeq package - :param test_chroma: bool - Test for the python ChromA package - :return: - """ - - failed = False - - for req, (pref, cmd) in {k: test_package[k] for k in test_targets}.items(): - try: - proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout = pref + " ".join([l for l in proc.stdout.decode().strip().split("\n") if l.strip() != ""]) - print(stdout) - except subprocess.CalledProcessError: - print("{req} : {req} not found (CalledProcessError)".format(req=req)) - failed = True - - if test_htseq: - try: - import HTSeq - print("HTSeq : " + str(HTSeq.__version__)) - except ImportError: - print("HTSeq : HTSeq not found (ImportError)") - failed = True - - if test_chroma: - try: - import ChromA - print("ChromA : " + str(ChromA.__version__)) - except ImportError: - print("ChromA : ChromA not found (ImportError)") - failed = True - - if failed: - raise FileNotFoundError - - return True - - -# ArgumentParser that tests requirements if it fails to parse arguments -# I just want to run a script with no arguments and have it test dependencies -class ArgParseTestRequirements(argparse.ArgumentParser): - - def error(self, message): - try: - test_requirements_exist() - except FileNotFoundError: - pass - finally: - super(ArgParseTestRequirements, self).error(message) diff --git a/srrTomat0/srrTomat0_mkref.py b/srrTomat0/srrTomat0_mkref.py deleted file mode 100644 index 5d71ee4..0000000 --- a/srrTomat0/srrTomat0_mkref.py +++ /dev/null @@ -1,32 +0,0 @@ -import argparse - -from srrTomat0.processor.star import star_mkref - - -def main(): - ap = argparse.ArgumentParser(description="Create a reference genome. 
All other arguments will be passed to STAR.") - ap.add_argument("-f", "--fasta", dest="fasta", help="FASTA FILE(s)", nargs="+", metavar="FILE", default=None) - ap.add_argument("-a", "--annotation", dest="annotation", help="Annotation GTF/GFF FILE", metavar="FILE", - default=None) - ap.add_argument("-g", "--genome", dest="genome", help="Create standard ref genome", metavar="PATH", default=None) - ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) - ap.add_argument("--cpu", dest="cpu", help="NUMBER of cores to use", metavar="PATH", type=int, default=4) - - args, star_args = ap.parse_known_args() - - if (args.fasta is None or args.annotation is None) and args.genome is None: - print("One of (--fasta and --annotation) or --genome must be set. Not neither.") - exit(0) - elif (args.fasta is not None or args.annotation is not None) and args.genome is not None: - print("One of (--fasta and --annotation) or --genome must be set. Not both.") - elif args.genome is not None: - star_mkref(args.out, default_genome=args.genome, cores=args.cpu, star_options=star_args) - elif args.fasta is not None and args.annotation is not None: - star_mkref(args.out, genome_file=args.fasta, annotation_file=args.annotation, star_options=star_args, - cores=args.cpu) - else: - raise ValueError("Switch error") - - -if __name__ == '__main__': - main() diff --git a/srrTomat0/tests/__init__.py b/srrTomat0/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/srrTomat0/tests/artifacts/M00799_2.00.txt b/srrTomat0/tests/artifacts/M00799_2.00.txt deleted file mode 100644 index 41ecdd1..0000000 --- a/srrTomat0/tests/artifacts/M00799_2.00.txt +++ /dev/null @@ -1,10 +0,0 @@ -Pos A C G T -1 0.248650039776609 0.26139859992769 0.241301320519092 0.248650039776609 -2 0.392226269785661 0.219606847798542 0.0702344472606129 0.317932435155184 -3 0.0257615986027584 0.0113462234969035 0.942829024734492 0.0200631531658465 -4 0.931183694119653 
0.0181098604626899 0.00917476304082202 0.0415316823768348 -5 0.0209101275685474 0.0204960793014344 0.00500398009361691 0.953589813036401 -6 0.652610298711976 0.0667517267238459 0.0282467996828702 0.252391174881308 -7 0.541666052707409 0.110459581183674 0.158324966820951 0.189549399287965 -8 0.196060485729292 0.286696226860535 0.350171562229744 0.167071725180429 -9 0.286129811839987 0.225077681884186 0.309960724838339 0.178831781437488 diff --git a/srrTomat0/tests/artifacts/sc64_mito.fasta b/srrTomat0/tests/artifacts/sc64_mito.fasta deleted file mode 100644 index 2b88a72..0000000 --- a/srrTomat0/tests/artifacts/sc64_mito.fasta +++ /dev/null @@ -1,1431 +0,0 @@ ->Mito dna:chromosome chromosome:R64-1-1:Mito:1:85779:1 REF -TTCATAATTAATTTTTTATATATATATTATATTATAATATTAATTTATATTATAAAAATA -ATATTTATTATTAAAATATTTATTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCC -GGAATTATTAATTAATAATAAATTATTATTAATAATTATTTATTATTTTATCATTAAAAT -ATATAAATAAAAAATATTAAAAAGATAAAAAAAATAATGTTTATTCTTTATATAAATTAT -ATATATATATATAATTAATTAATTAATTAATTAATTAATAATAAAAATATAATTATAAAT -AATATAAATATTATTCTTTATTAATAAATATATATTTATATATTATAAAAGTATCTTAAT -TAATAAAAATAAACATTTAATAATATGAATTATATATTATTATTATTATTAATAAAATTA -TTAATAATAATCAATATGAAATTAATAAAAATCTTATAAAAAAGTAATGAATACTCCTTT -TTAAAAATAAAAAGGGGTTCGGTCCCCCCCCTTCCGTATACTTACGGGAGGGGGGTCCCT -CACTCCTTCTTAATTAAATTATCTTAATTAAATTATCTTAATTAAATTATCTTAATTAAA -TTATCTTAATTAAATTATCTTAATTAAATTAAAAGGGGACTTTATATTTATAAAGTAATT -ATATTATTATTATTATTATTATTTATTTATTTTATTTTTATTATTTTATTATATATATTA -TATATTAATACAGATAGAAGCCAAAAGGTCAGGCGCTTTCTTTGGGAGAAAGACCTAGTT -AGTTCGAGTCTATCCTATCTGATAATAATTTAATTAACCATTAAAAAAAAGTATATATAT -TTATCATAATATATTAAATTTTATTACATTACAAATGAACACTTTTATTTATATTTATAA -AAATATGAACTCCTTCGGGGTCCGCCCCGCGGGGGCGGGCCGGACTCCATATTATTATTA -TTATAATTATTATTATAATTATTATTATAATTATTATTATAATTATTATTATAATTAAAG -AGTTTTGGATACCAATATGATATAATATGATATAGGACCGAAACCCCTCATTTTATCATT -TATTTATAATATTATAAATAAAAAAAAATATTATATATTATAATAAAATTAATATCATAA -TATATTATATTATATATTATATTATATATATATATATATATATTCTTTTATAAAATTTAT 
-ATTCTTCTTATTAAAATTAAAAAGGGAGCGGACTTTTAATTATATTTAATTATAGTTTTT -AATCATTGGTTGAGATTTCAAAATAAGGTATAATATTTATATTATTCTTTAACAAATATT -ATATTATAAAAAAAGATATAATATTTATATTATTCTTTAACAAATATTATATTATAAAAA -AGATATAATATTTATATATTATTATTAATATTATTTTTAAGTTCCGAAAGGAGAAACTTA -TAATTTTTATATCATTATTTATTATTATTTTTAATTTCAACTCCTTTTAGGTATTTCCAT -TTAACTTTCAGCAGAGACTTTCTAATTATAATTATATATATATAAATTTAAATACATTTA -TAAAAAAGTATATAATATAATTATATTATATATAATAATATTATTAAATGAAGTATTCTT -TATTATTAATTATAGGATATCTGGGGTCCATTAATAATTATTATTGTAAATAATAATAAG -GACCCCCCCCATTATCTAATTAATAAATATATAAATAATCATTAATAAATATATTAATAA -TTATTAATAAATATATAAATAATCATTAATAAATATATAAATAATATAATATATTATAAA -AATATAATAATAATAATTTATTATTAAAATATAATAATTTATTATAAAAATATAATAATT -TATTATAAAAATATAATAATAACTCCTTTCGGGGTTCACACCTTTATAAATAATAAATAA -TAAATAATAAATAATAAATAATAAATATTAGTATTCACTAATATAAAATAATAATTATAA -AAATAATCATTATTAAAAATATTATTAATTATTAAATTAAATACAATTAATATAATTTAG -TTGTTTATATAATTTTAAATAATGTTTATATCAATTTAATAAAATTAAATTTATAGTTCC -GGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTTTATCTATATATTATAATAACTA -TATGAATTTAATTATTAAAAATAATAAAAATAAGGAATTTTAATAAGAAGTAATATTTAT -TATATAATATATAAAAAAAATATATATATATATATAAAAATATATATAATAAGTTTTATT -ATAATATATATTAAATTAATTATTATGAGGGGTTCGGTCCCTTTCCGGGCCCCAATTCAT -CTCATCTCATTTTATTTCATTTCAATATCATCTAATCTCATTTCTTTATAGATTTTACAT -ATATATAAATATAAATATAAGATATTCACATTTATATATAATATAATATAATATAATAGA -TATTCATTCCTCTTTGATTAAACTAATAATTAATAATTAATAATTAATAATTAATAATTA -ATAATTATTCAGTAGAACTCCTTCTTAAAAAGGGGTTCGGTCCCCCTCCCATTAGTATAG -TATAGGGAGGGGTCCCTCACTCCTTCGGGGTCCGCCCCGCAGGGGGCGGGCCGGACTATT -ATTAAATAATTTATAATTTATTATTTATTAATATATTTATATAATATAATATAATATAAT -ATTATTCATACTTTTTATTAATATAATATAATATAATATTATTAATACTTTCTCCTTTCG -GGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTAATATAAAGAAAAGAGTTTCAATT -ATTTATTTATTTATTTATTTTTTATAAAAATAAGTCCCCGCCCCGGCGGGGACCCCGAAG -GAGTATTAATTTAAATAATTTATTTAATGAAATTATTAATTATAAATAAAAATAATAATT -TTTAAAGATGTAATATAAAAATAAATATAATATAATTTAGGATAATTATATAAAATATTT -ATTATATATAGTTTTTATAAAGAGTTTTAAAAGTGATAATATAATATATAATATTTATAA -GTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTTATTTATATATATATAAT 
-TATAATCTTATTAATTATTTATATATATATTTAATATTATTTTTATATAATTTTATATTA -AAGTATTATAATTATATATTTAATATTATTTTTATATAATTTTATATTATTTATTTATTT -ATTTATTTATTTAAAAATATTATAATCATATATTTAATATTATTTAATATATTTTATATA -TTATATCTTTTATTGATTTATATATATATAGATTTAATAAATATATATATATATATATAT -ATAAATATTCATTATATATTTATTATTATTATTATTATTTATTACTATTTTTTATTATAT -ATTAATAATATATATATTATTAGTTATGGGTATCCTAATAGTATATTATTATTTTTAATA -ATAATTTATGATTTATGTATAATAAATAAGTAGGGAATCGGTACGAATATCGAAAGGAGT -TATATATTATTAATTATTTATAATTATTTTATATATTATTAATTATTTATAATTATTTTA -TATATTTATAATTATTTTATATAGATAGGTTAGATAGGATAGATAGTATAGATAGGGGTC -CCATTTATTATTTACAATAATAATTATTAATGGGACCCGGATATCTTATTGTTATTAATT -TATATATTATTCATTATTATTAATATATATTTAATATAATTAAATATTATATTATATTAT -ATTATATTATTTATTAAAAAAAAATCTATTACTTATTTTTTTTATTAATATATAAATTAT -TTATATAATTTATCATTTTTATTTATATATTATTATTTTTTATATATAAATTAATATATA -TATATATTATATATACTTTTTTTTTTATAATATATCTATATATATAAATAAATATATTAT -ATTATATTTTTATATAATATATTATTAATTATTATTTTAATTTTCTATTCTATTGTGGGG -GTCCCAATTATTATTTTCAATAATAATTATTATTGGGACCCGGATATCTTCTTGTTTATC -ATTTATTATTTTATTAAATTTATTATTATTTTTAATTTATATTTATATTATATAATTAAT -TATATCGTTTATACTCCTTCGGGGTCCCCGCCGGGGCGGGGACTTTATATTTTATTATAT -AATATATTATATTCTTATAATATATTTATTGATTATGTTATAAAATTTATTCTATGTGTG -CTCTATATATATTTAATATTCTGGTTATTATCACCCACCCCCTCCCCCTATTACGTCTCC -GAGGTCCCGGTTTCGTAAGAAACCGGGACTTATATATTTATAAATATAAATCTAACTTAA -TTAATAATTTAAATAATATACTTTATATTTTATAAATAAAAATAATTATAACCTTTTTTA -TAATTATATATAATAATAATATATATTATCAAATAATTATTATTTCTTTTTTTTCTTTAA -TTAATTAATTAATTAATATTTTATAAAAATATATTTCTCCTTACGGGGTTCCGGCTCCCG -TAGCCGGGGCCCGAAACTAAATAAAATATATTATTAATAATATTATATAATATAATAATA -ATATAATAATTTTATATAAATATATATTTATATATTAAATTAAATTATAATTTTATTATG -AAAATTATATCTTTTTTTTATATTTTTATATAATAAAAATATGTTATATATATATTAATA -ATAAAAGGTAGTGAGGATTAAATAAATTATATAATAATTATAACTCTTAATTATAAAATA -AATATATATATATATATAAGTATCCATTTCCATATAATCTTTTAATAAATATTAATAAAT -ATTAAAAAAAAATAATATTATAATATTTTAGTATATAATTCAATAAAATTCATTGGAGGG -GTAAATAATAATAATTTACTAATGGCAAGTTATAGTCTTAAAGGTTTTTATTTTTTTTAT -TAAATTAATAAAATAATAATACCATTTATATATTCCATTATATATATATATTTAATAAAA 
-ATAATAATATCATTTATATATTTTATTATATATTATATATATTTTATATAAAATAATAAT -AATAAATTTATATTTTTATATATTATTATTAAATAATAATAATATAAATAACTCCTTCGG -GGTTCGGTCCCCACGGGTCCCTCACTCCTTCTTAAGAATAAAAAGGGGTTCGGTCCCCCT -CCCGTTAGTACACGGGAGGGGGTCTCTCACTCCTTCTTAAAAAATAAAAAGGTGGAAGGA -CTAATATAATTTTAAATAATAATTAATACTTTAATAATAATTTGTATTTCTTTATTATTA -ATATATTAAATATAATAATAATTAATATAATTACAATATATTAATATTATCAAATATTAA -TAAATATACTTTTTTATATAATTTATTTATTTATTTATTTTTTTTTTATTAAACTAATTA -TAATTGTAATTTCGAAAAGGGGGTGGGAGTAAACATATATAATTTATAATCTATATATAT -ATATATATAATTTTTTAATAAATATTAATAAATATTTATAAAAAGAATAATTTATATTTA -TAATATATAATTTATATATTTTATTTTTATTATACAATTAATATAAAATATAAAATATTA -AATATTAAATATTAAATATTAAATATTAAATATTAATTTTTATAGGGGTTATATAATAAT -TATATTTATAATTATATAATATTAAAAAGGGTATTTTTATAATTATTACATTTTTATTTT -ATTTATAAAAATATTAATTTTAATAAGTATTGAATACTTTATATAATATAAATATTAATT -ACATAATTAATAATTAAATAATATTTAATAATATTATTTAAATTTATTATTTATAATTAT -TTATTTATAAAATTCTATTTTTATTATTATTATTTTTATTTTATTATTAAAGATTAATAT -AATAATTATTAATATATTAAAAATCTTTTATTATATTAATATTTATAAAAAAGTATTTAA -TAAAAAAGATGTATAAATTTATAAATTATATAATATTATTAATTTATATAATAATAATAT -TATAACTTTGTGATTGTCAATTTAGTTAATCATTGTTATTAATAAAGGAAAGATATAAAA -AATATTCTCCTTCTTAAAAAGGGGTTCGGTTCCCCCCCGTAAGGGGGGGGTCCCTCACTC -CTTTGGTCGGACTCCTTCGGGGTCCGCCCCGCGGGGGCGGGCCGGACTAATTTAACTTTT -AATATTAATATTAATATTATTTATATTTTTAATATATAAAAATAAATAATTTTATTTTTA -TTAATAGTATATTATATAAACAATAAAATAGTATTAATTATATAAAATTTATATAAAATA -TATATAAATTTATTATATATATATATATTAATATTTTAATAAAGTTTTTATTATAAATTT -ATTTATTTATTTATTATAATATTAATAATTTATTTATTATTATATAAGTAATAAATAATA -GTTTTATATAATAATAATAATATATATATATATATATTATTATATTAGTTATATAATAAG -GAAAAGTAAAAAATTTATAAGAATATGATGTTGGTTCAGATTAAGCGCTAAATAAGGACA -TGACACATGCGAATCATACGTTTATTATTGATAAGATAATAAATATGTGGTGTAAACGTG -AGTAATTTTATTAGGAATTAATGAACTATAGAATAAGCTAAATACTTAATATATTATTAT -ATAAAAATAATTTATATAATAAAAAGGATATATATATAATATATATTTATCTATAGTCAA -GCCAATAATGGTTTAGGTAGTAGGTTTATTAAGAGTTAAACCTAGCCAACGATCCATAAT -CGATAATGAAAGTTAGAACGATCACGTTGACTCTGAAATATAGTCAATATCTATAAGATA -CAGCAGTGAGGAATATTGGACAATGATCGAAAGATTGATCCAGTTACTTATTAGGATGAT 
-ATATAAAAATATTTTATTTTATTTATAAATATTAAATATTTATAATAATAATAATAATAA -TATATATATATAAATTGATTAAAAATAAAATCCATAAATAATTAAAATAATGATATTAAT -TACCATATATATTTTTATATGGATATATATATTAATAATAATATTAATTTTATTATTATT -AATAATATATTTTAATAGTCCTGACTAATATTTGTGCCAGCAGTCGCGGTAACACAAAGA -GGGCGAGCGTTAATCATAATGGTTTAAAGGATCCGTAGAATGAATTATATATTATAATTT -AGAGTTAATAAAATATAATTAAAGAATTATAATAGTAAAGATGAAATAATAATAATAATT -ATAAGACTAATATATGTGAAAATATTAATTAAATATTAACTGACATTGAGGGATTAAAAC -TAGAGTAGCGAAACGGATTCGATACCCGTGTAGTTCTAGTAGTAAACTATGAATACAATT -ATTTATAATATATATTATATATAAATAATAAATGAAAATGAAAGTATTCCACCTGAAGAG -TACGTTAGCAATAATGAAACTCAAAACAATAGACGGTTACAGACTTAAGCAGTGGAGCAT -GTTATTTAATTCGATAATCCACGACTAACCTTACCATATTTTGAATATTATAATAATTAT -TATAATTATTATATTACAGGCGTTACATTGTTGTCTTTAGTTCGTGCTGCAAAGTTTTAG -ATTAAGTTCATAAACGAACAAAACTCCATATATATAATTTTAATTATATATAATTTTATA -TTATTTATTAATATAAAGAAAGGAATTAAGACAAATCATAATGATCCTTATAATATGGGT -AATAGACGTGCTATAATAAAATGATAATAAAATTATATAAAATATATTTAATTATATTTA -ATTAATAATATAAAACATTTTAATTTTTAATATATTTTTTTATTATATATTAATATGAAT -TATAATCTGAAATTCGATTATATGAAAAAAGAATTGCTAGTAATACGTAAATTAGTATGT -TACGGTGAATATTCTAACTGTTTCGCACTAATCACTCATCACGCGTTGAAACATATTATT -ATCTTATTATTTATATAATATTTTTTAATAAATATTAATAATTATTAATTTATATTTATT -TATATCAGAAATAATATGAATTAATGCGAAGTTGAAATACAGTTACCGTAGGGGAACCTG -CGGTGGGCTTATAAATATCTTAAATATTCTTACATAAATATTAATCTAAATATTAATATA -AATATTAATATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAAA -TATTAATATAAATATAAATATTAATATAAATATAAATATAAATATAAATATATTTTAATA -TAATATAATATAATATATAATATATTATATAAATATAATATATAAATAATATAATAAAAT -ATTTTAATATATATATAATATAATATAATTATTATTATAATTTAATATAAATTATTATTA -TAATTTAATATAATAAATAAATAAATAATTATAATTATAATTATAATTATAATCTCAATA -TATAAATGATAAATTATTATAAATACAAAGGAAATAATTGATTTTTAAAATATATTTAAT -AAAATATATAATATAAATTATACTTTTTTTGTTATTATATAATAATTATATTAATATATT -TAATAGAATTAAACTCCTTCGGCCGGACTATTATTCATTTTATATATTAATGATAAATCA -TTAATTATTATTAATAAATTTATTTATAATATTTAATTTTATATATTATTATTTATAATA -AAAAAAATTATATTATAACAATTTAATTTTAATTTTTATTTTTAAATTATAAAATTAATA -ATTTATTTGTTTAAATAAAATTTATAACTCCTTCGGGGTTCGGCCGGACTATTAATATAA 
-ATAAATAATAAATATTTATAATAAAATAATATACATCTTCTTTAAATAAAAAAAGGGGAC -ATTATAAATAGTATATAAATATATTATATCTTTTTTATTATTATTATTAATAAATAATAA -TAATAATTTATATATTTATAATATATTTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGA -ACCCCGAAAGGAGAATGTATTATAATTATTACATATAATTATTATTATTCACTTCTTATT -AAAAATAATACTCTATATAATTTATATAATTTATTTTAATATATATATATTTATATATAA -TATAATATATATATTTATTTATTATAATCATTTTTTTTTAACTTAAAATAAAACTTATTA -TAATTTATATAATTTATAATTTTTATATAAAAATAATTATATAATTTTTATTTATTTATA -TAATAATAATATTATTTGTTATATATTATATATTATATATATAATAAATAAATAAATAAT -AAATAATAATAATAAGGATATAGTTTAATGGTAAAACAGTTGATTTCAAATCAATCATTA -GGAGTTCGAATCTCTTTATCCTTGATAATAATAATAAAAATATGTATTTATTTAATTATT -TTAATATTTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTAATATA -ATATAATATAATATAAATATTCATTTATCTTTTTTTTAATATTCTTAATTAATTAATTAA -TTAATATATTAATTATAAAAAATATATTATAATTTTATTATTAATAAGTATAAATATATT -ATTAATAATAATTTATTAAAAATATATTATTATAATATATTAATATATCATAATTATAAT -CAATATTATATTATTTAATTTTATAATACTTAATTATTAATATATTATTCATATATATAT -AAATTAAATTAAATTAATTATATTGAATATATAAATATATATATATATAAATATATAAAA -AATTATATAAATTATTTTAAGTAAAAATAATATTAATAAAAATTATACAATAATAATAAT -AAATATTCATTATTATTTAATTAATATCTCCTTTACTTCTTTTTCCTCCGTTGAGGACTT -ATTATTAAGTATATTATTATATACTACTTAAGATTATATATATAATATATATATATATAT -TATATATAAAATATAAATATATAAATAATATAAAAATTAATAAAATAAATAAAATAAATT -AGTCCGATCGAATCCCCTATTTAATTAAATTAAATTAAATTAAGAAAGAGATAAATTTAT -ATAAAATATTATTTATAATTAATTATAATTAAATTATAATATAATATAATATAAATAATA -ATATAATAAAAATAAAAATAAAATAATATTAGATTATATTATATAATTTATATAATTTTT -TAATAATAATAATAAATAAGTTTATTTATAATTATAAATATAAATATAAATATAAATAAA -GAAGGTATTATATTTTATAAAATATAATAATAATACAAAATTTATATTTTAATAAATATT -AATATAAGTTTAAAGTTCCGGGGCCCGGCACGGGAGCCGGAACCCCGAAAGGAGAAATAA -ATAATATATTTATAAAAAATTAAATAAATAAATATTATCTATTTAAAAATAAATATAATA -TAATATAATATAATAATTCTAAATATAAATAATATTTATTATAATTATTATAATAATTGT -ATTATTTATTAATAATATATATAATTATATTAAAACTAATATTACATTATTTTGTATATT -TAAACAATTAAATTGATTATTCTTATTTGTAATCTTTATTTATTTTATTATATCTTATTA -ATGATAAATTATAATTATTATTAAAATAATAATTTACTTCTTTTGATATAAAAATAAAAT -AATATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGATAAATATATTAT 
-ATTTTTATTCCTACCTATTAAAGGTAAAGACTCGATTCTCATAATTAAATTTATATCCTT -CGGCCGGATTAATTTATTTTATTTATATTTATATTTATAGTGAATACCTTTTTTAATATT -TATTTTTAATATTTATTTTTAATATTTTATTTTTAATAAAATATAATCTTGTAAGTAAGA -AAAGAATTTCGGTGATTGGAACCTTGAAAGGATAAATTTCTTATTTATTATAATATTTAT -ATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTATTATTAAACA -TTTAATATATTATATTAATATTTAATTTAAATGATTAATATATTATTATAATAATATTTA -TTTTATATTAAAATATTATAATTAATATATATATATTTATTTTAATAATATTATTATTAT -TATTATTAAAATTATTATTTTTATAAATATATATATATATATATATATATTATTTTTATT -CTTATATAAATTATATAAAAAAAATATATATAATATATAATTAATTAATATATATTATTT -AAATTATATATTATTTAAAATACTTTTTATATTATATCTTCTTTAAATTAAAATATAATT -ATTATTTATATTATAATTATTTATGAAATATTATTATTAAAATAAAAAAGAGGTTTAGAC -TATATATTTATTATTTATAAACTTATTATATTATTTATTATTAATAGTTCCGGGGCCCGG -CCACGGGAGCCGGAACCCCGAAAGGAGAAATAAATAAAATAAAAAATAATAAATATTAAT -ATTATTAAATATTATTTATAATAAATATTAATATTATTAAATATTATTCATATTAATAAA -TTTTATTATTATTTGTAATATATTAAATATTAATAATATATATATTATTTATTATAATGA -AAACCTATCCTATATTATCCTATCATATAATATCATATCATATTATATTATATCTTATTA -TATGATATATAAAGTATTCACTCTATATGAGGTTATGATTATTATATAAATCTTATTTTA -TTTTTATTTTTATTTGGACTAATAATAATTATAATAATAATTATTGATATGTTCTAATAT -TAATAAATACATATTTATATTATAATATAAATATTCATTTCTTACTAATTAATAAAAAGT -TTTTATATTCATTATAATATAAATATATAAATATATATAAATATTTTAATAATTATAATT -ATATTAAGATATTATAAATATATATTTATTTTTTTTTATAAAATAAATAAATAAATAAAT -AATTAATATTTTTATATTATAACTTATTTTTATAATAATAATAAGTATTTTATTTTTTAT -TATATTATTATTTATATAATTATATATATATTAATTTCAATTTAATTAATTAATTAATTG -GTATTTGGCATATAATATCAATTAATTGTAATTCTTATAAGAATTAATTAATTAATATGC -TTTTTATATAATTTATACTTTTATATTTCTCCTTCCGGGGTTCCGGCTCCCGTGGCCGGG -CCCCGGAACTATTATTATTATTTTTATTTATTTATTATTAAAATATAATAATAAATAGTC -CGGCCCGCCCCGCGGGGCGGACGCCGGAGGAGAATTATATTTTTATATAATAATTTATAT -TTCTATATATATATATATATATTATATATAAATATTATTATATATATTTTTATATATATT -ATAATTATATTCATTAATATTTTATTATAGTGGTGGGGTCCCAATTATTATTTTCAATAA -TAATTTATCATGGGACCCGGATATCTTCTTGTTTTTATTTATTATTTTATTAAATTTATT -TTAATTATTTATTTATAATTTATATTATACAATTTATTATTTCGTTAATACCTTTATTTA -TATTATATAATATATTATATTATTATAATATATTTATTGATTATATTAATACATTTAACT 
-AATGTGTGCTCTATATTTATTGAATAGTTTGGTTCTTATCACCCACCCCCTCCCCCTATT -ACGTCTCCGAGGTCCCGGTTTCGTAAGAAACCGGGACTTATATATTTAATACTAAAAATA -TAACTACATTACTTTTTTAATATATATAACAATATATATATATATATATATTAATTATAT -AAAATATAATACTCTATATTAAATATTATTTTTATCAATATTTATTTATATATATAATAA -TAATAATAATAATCAATATTAATTATTTATATATATAAGATTAATATTATTTAATATATT -ATGAATAATTTAATTAATAAATCTTTAAATATTATCATAAAAATATAAATTAAATAATTT -CTTATTTATAATAAAGAATAATAATATATATAAATATAATAAAGAATGTAAATAATATAT -ATATAATATAATATAATATAAAAAATATATATATATATAAATATATATATAATATATAGA -TAATAATATTTTTATATAATTTATTTTATTATTAAGTAATAAATAATAAAAAAATCAATA -TATTAAATAATATATTTATATTAGTTCGGTTTAGTTGGTATTTTGTAATGAGTAAAAAGT -AATATATAATATTAAATAATAAGTATTGATATAAGTAATAGATATAATAATAATATTATT -AATATTTTATATAAATAATATTAATAATATAGATTATGAAAGAGAGTATTAATATCATTA -AATATATATATATGTTATATAATTTAAATGATTTTAATATATATATATATATTATATTAT -AGATTATGATACATTTATATAAATAATATATATATAAAAATTAATTATACTATTACTTTA -TAATATAATAATATTTATTTATAAAGATATAAAAGAATTGTTTAAAGTTATAACTAAAAT -ATTATATAGTATTCATTAATAATTAATATTATAAATTCAACTATTGTTATATTTATAAAT -AGAATAATATATTATTATCCTTTAAGATATAACAATAATTATTTAAATTAAATTAAATTA -AATTTAATTAATTTTTTTTTTTAATGAATATAATAATAATAATATTATTAAAATTAATAT -ATAAAAAAAAAGTAAAAATGGTACAAAGATGATTATATTCAACAAATGCAAAAGATATTG -CAGTATTATATTTTATGTTAGCTATTTTTAGTGGTATGGCAGGAACAGCAATGTCTTTAA -TCATTAGATTAGAATTAGCTGCACCTGGTTCACAATATTTACATGGTAATTCACAATTAT -TTAATGGTGCGCCTCTCAGTGCGTATATTTCGTTGATGCGTCTAGCATTAGTATTATGAA -TCATCAATAGATACTTAAAACATATGACTAACTCAGTAGGGGCTAACTTTACGGGGACAA -TAGCATGTCATAAAACACCTATGATTAGTGTAGGTGGAGTTAAGTGTTACATGGTTAGGT -TAACGAACTTCTTACAAGTCTTTATCAGGATTACAATTTCCTCTTATCATTTGGATATAG -TAAAACAAGTTTGATTATTTTACGTTGAGGTAATCAGATTATGATTCATTGTTTTAGATA -GCACAGGCAGTGTGAAAAAGATGAAGGACCTAAATAACACAAAAGGAAATACGAAAAGTG -AGGGATCAACTGAAAGAGGAAACTCTGGAGTTGACAGAGGTATAGTAGTACCGAATACTC -AAATAAAAATGAGATTTTTAAATCAAGTTAGATACTATTCAGTAAATAATAATTTAAAAA -TAGGGAAGGATACCAATATTGAGTTATCAAAAGATACAAGTACTTCGGACTTGTTAGAAT -TTGAGAAATTAGTAATAGATAATATAAATGAGGAAAATATAAATAATAATTTATTAAGTA -TTATAAAAAACGTAGATATATTAATATTAGCATATAATAGAATTAAGAGTAAACCTGGTA 
-ATATAACTCCAGGTACAACATTAGAAACATTAGATGGTATAAATATAATATATTTAAATA -AATTATCAAATGAATTAGGAACAGGTAAATTCAAATTTAAACCCATGAGAATAGTTAATA -TTCCTAAACCTAAAGGTGGTATAAGACCTTTAAGTGTAGGTAATCCAAGAGATAAAATTG -TACAAGAAGTTATAAGAATAATTTTAGATACAATTTTTGATAAAAAGATATCAACACATT -CACATGGTTTTAGAAAGAATATAAGTTGTCAAACAGCAATTTGAGAAGTTAGAAATATAT -TTGGTGGAAGTAATTGATTTATTGAAGTAGACTTAAAAAAATGTTTTGATACAATTTCTC -ATGATTTAATTATTAAAGAATTAAAAAGATATATTTCAGATAAAGGTTTTATTGATTTAG -TATATAAATTATTAAGAGCTGGTTATATTGATGAGAAAGGAACTTATCATAAACCTATAT -TAGGTTTACCTCAAGGATCATTAATTAGTCCTATCTTATGTAATATTGTAATAACATTGG -TAGATAATTGATTAGAAGATTATATTAATTTATATAATAAAGGTAAAGTTAAAAAACAAC -ATCCTACATATAAAAAATTATCAAGAATAATTGCAAAAGCTAAAATATTTTCGACAAGAT -TAAAATTACATAAAGAAAGAGCTAAAGGCCCACTATTTATTTATAATGATCCTAATTTCA -AGAGAATAAAATACGTTAGATATGCAGATGATATTTTAATTGGGGTATTAGGTTCAAAAA -ATGATTGTAAAATAATCAAAAGAGATTTAAACAATTTTTTAAATTCATTAGGTTTAACTA -TAAATGAAGAAAAAACTTTAATTACTTGTGCAACTGAACTACCAGCAAGATTTTTAGGTT -ATAATATTTCAATTACACCTTTAAAAAGAATACCTACAGTTACTAAACTAATTAGAGGTA -AACTTATTAGAAGTAGAAATACAACTAGACCTATTATTAATGCACCAATTAGAGATATTA -TCAATAAATTAGCTACTAATGGATATTGTAAGCATAATAAAAATGGTAGAATAGGAGTGC -CTACAAGAGTAGGTAGATGACTATATGAAGAACCTAGAACAATTATTAATAATTATAAAG -CGTTAGGTAGAGGTATCTTAAATTATTATAAATTAGCTACTAATTATAAAAGATTAAGAG -AAAGAATCTATTACGTATTATATTATTCATGTGTATTAACTTTAGCTAGTAAATATAGAT -TAAAAACAATAAGTAAAACTATTAAAAAATTTGGTTATAATTTAAATATTATTGAAAATG -ATAAATTAATTGCCAATTTTCCAAGAAATACTTTTGATAATATCAAAAAAATTGAAAATC -ATGGTATATTTATATATATATCAGAAGCTAAAGTAACTGATCCTTTTGAATATATCGATT -CAATTAAATATATATTACCTACAGCTAAAGCTAATTTTAATAAACCTTGTAGTATTTGTA -ATTCAACTATTGATGTAGAAATACATCATGTTAAACAATTACATAGAGGTATATTAAAAG -CACTTAAAGATTATATTCTAGGTAGAATAATTACCATAAACAGAAAACAAATTCCATTAT -GTAAACAATGTCATATTAAAACACATAAAAATAAATTTAAAAATATAGGACCTGGTATAT -AAAATCTATTATTAATGATACTCAATATGGAAAGCCGTATGATGGGAAACTATCACGTAC -GGTTTGGGAAAGGCTCTTTAACACGTGGCAACATAGGTTAATTTGCTATTTCATTTTTAG -TAGTTGGTCATGCTGTATTAATGATTTTCTGTGCGCCGTTTCGCTTAATTTATCACTGTA -TTGAAGTGTTAATTGATAAACATATCTCTGTTTATTCAATTAATGAAAACTTTACCGTAT 
-CATTTTGGTTCTGATTATTAGTAGTAACATACATAGTATTTAGATACGTAAACCATATGG -CTTACCCAGTTGGGGCCAACTCAACGGGGACAATAGCATGCCATAAAAGCGCTGGAGTAA -AACAGCCAGCGCAAGGTAAGAACTGTCCGATGGCTAGGTTAACGAATTCCTGTAAAGAAT -GTTTAGGGTTCTCATTAACTCCTTCCCACTTGGGGATTGTGATTCATGCTTATGTATTGG -AAGAAGAGGTACACGAGTTAACCAAAAATGAATCATTAGCTTTAAGTAAAAGTTGACATT -TGGAGGGCTGTACGAGTTCAAATGGAAAATTAAGAAATACGGGATTGTCCGAAAGGGGAA -ACCCTGGGGATAACGGAGTCTTCATAGTACCCAAATTTAATTTAAATAAAGTGAGATACT -TTAGTACTTTATCTAAATTAAATGCAAGGAAGGAAGACAGTTTAGCGTATTTAACAAAGA -TTAATACTACGGATTTTTCCGAGTTAAATAAATTAATAGAAAATAATCATAATAAACTTG -AAACCATTAATACTAGAATTTTAAAATTAATGTCAGATATTAGAATGTTATTAATTGCTT -ATAATAAAATTAAAAGTAAGAAAGGTAATATATCTAAAGGTTCTAATAATATTACCTTAG -ATGGGATTAATATTTCATATTTAAATAAATTATCTAAAGATATTAACACTAATATGTTTA -AATTTTCTCCGGTTAGAAGAGTTGAAATTCCTAAAACATCTGGAGGATTTAGACCTTTAA -GTGTTGGAAATCCTAGAGAAAAAATTGTACAAGAAAGTATGAGAATAATATTAGAAATTA -TCTATAATAATAGTTTCTCTTATTATTCTCATGGATTTAGACCTAACTTATCTTGTTTAA -CAGCTATTATTCAATGTAAAAATTATATGCAATACTGTAATTGATTTATTAAAGTAGATT -TAAATAAATGCTTTGATACAATTCCACATAATATGTTAATTAATGTATTAAATGAGAGAA -TCAAAGATAAAGGTTTCATAGACTTATTATATAAATTATTAAGAGCTGGATATGTTGATA -AAAATAATAATTATCATAATACAACTTTAGGAATTCCTCAAGGTAGTGTTGTCAGTCCTA -TTTTATGTAATATTTTTTTAGATAAATTAGATAAATATTTAGAAAATAAATTTGAGAATG -AATTCAATACTGGAAATATGTCTAATAGAGGTAGAAATCCAATTTATAATAGTTTATCAT -CTAAAATTTATAGATGTAAATTATTATCTGAAAAATTAAAATTGATTAGATTAAGAGACC -ATTACCAAAGAAATATGGGATCTGATAAAAGTTTTAAAAGAGCTTATTTTGTTAGATATG -CTGATGATATTATCATTGGTGTAATGGGTTCTCATAATGATTGTAAAAATATTTTAAACG -ATATTAATAACTTCTTAAAAGAAAATTTAGGTATGTCAATTAATATAGATAAATCCGTTA -TTAAACATTCTAAAGAAGGAGTTAGTTTTTTAGGGTATGATGTAAAAGTTACACCTTGAG -AAAAAAGACCTTATAGAATGATTAAAAAAGGTGATAATTTTATTAGGGTTAGACATCATA -CTAGTTTAGTTGTTAATGCCCCTATTAGAAGTATTGTAATAAAATTAAATAAACATGGCT -ATTGTTCTCATGGTATTTTAGGAAAACCCAGAGGGGTTGGAAGATTAATTCATGAAGAAA -TGAAAACCATTTTAATGCATTACTTAGCTGTTGGTAGAGGTATTATAAACTATTATAGAT -TAGCTACCAATTTTACCACATTAAGAGGTAGAATTACATACATTTTATTTTATTCATGTT -GTTTAACATTAGCAAGAAAATTTAAATTAAATACTGTTAAGAAAGTTATTTTAAAATTCG 
-GTAAAGTATTAGTTGATCCTCATTCAAAAGTTAGTTTTAGTATTGATGATTTTAAAATTA -GACATAAAATAAATATAACTGATTCTAATTATACACCTGATGAAATTTTAGATAGATATA -AATATATGTTACCTAGATCTTTATCATTATTTAGTGGTATTTGTCAAATTTGTGGTTCTA -AACATGATTTAGAAGTACATCACGTAAGAACATTAAATAATGCTGCCAATAAAATTAAAG -ATGATTATTTATTAGGTAGAATGATTAAGATAAATAGAAAACAAATTACTATCTGTAAAA -CATGTCATTTTAAAGTTCATCAAGGTAAATATAATGGTCCAGGTTTATAATAATTATTAT -ACTATTAAATATGCGTTAAATGGAGAGCCGTATGATATGAAAGTATCACGTACGGTTCGG -AGAGGGCTCTTTTATATGAATGTTATTACATTCAGATAGGTTTGCTACTCTACTCTTAGT -AATGCCTGCTTTAATTGGAGGTTTTGGTAACCAAAAAAGATATGAAAGTAATAATAATAA -TAATCAAGTAATAGAAAATAAAGAATATAATTTAAAATTAAATTATGATAAGTTGGGACC -TTATTTAGCTGGATTAATTGAAGGTGATGGAACTATTCTAGTTCAAAATTCATCTTCAAT -AAAAAAATCTAAATATAGACCGTTAATTGTTGTAGTATTTAAATTAGAAGATTTAGAATT -AGCTAATTATTTATGTAATTTAACTAAATGTGGAAAAGTGTATAAAAAAATTAATCGTAA -TTATGTATTATGACTTATTCATGATTTAAAAGGTGTATATACATTATTAAATATTATTAA -TGGATATATGAGAACACCTAAATATGAAGCATTTGTTAGAGGTGCTGAATTTATAAATAA -TTATATTAATTCAACAACAATTCTACATAATAAATTAAAAAATATAGATAATATTAAAAT -TAAACCATTAGATACATCAGATATTGGTTCAAACGCTTGATTAGCTGGTATGACAGATGC -AGATGGTAATTTTTCTATTAATTTAATAAATGGTAAAAATCGTTCTAGTAGAGCAATGCC -TTATTATTGTTTAGAATTAAGACAAAATTATCAAAAAAATTCTAATAATAATAATATTAA -TTTTTCTTATTTTTATATTATGTCTGCAATTGCACTATATTTTAATGTTAATTTATATAG -TAGAGAACGTAATTTAAATTTATTAGTATCTCTTAATAATACGTATAAACTATATTATAG -TTATAAAGTAATAGTGGCTAATCTATATAAAAATATTAAAGTAATAGAATACTTTAATAA -ATATTCTTTATTATCATCTAAACACTTAGATTTTTTAGATTGATCTAAATTAGTTATTTT -AATTAATAATGAGGGTCAAAGTATAAAACTTAATGGTAGTTGAGAATTAGGTATAAATTT -ACGTAAAGATTATAATAAAACTAGAACTACGTTTACTTGATCTCATTTAAAAAATACATA -TTTAGAAAATAAATAAATAAATTATTATTACTTTCTTCCCCTCCGAATCCGTAATATATT -TACGGATATATAATCTCGTAGTGTAAAAGGTGTAACGAGATTATTAATAAGTTGCCGTAA -TATATTGTAAAATATATTATTATTACAACACTATATGCGGGAAAACCCTAAAGTCATAAT -ATAATATTATCCCCACGAGGGCCACACATGTGTGGCCCTCGCGGGGTATGGTAAATTTAA -TTAAGTTATAAATGTACTATAGTATTAAAAATTATTATGAATAATTTCCCCACCCCCATG -CGAAGCATGGGGGGGGGTATAAGTATGGACAATCCGCAGGAAACCAAATAATAATTAATA -TCCTGAAACAAAGTAAGTGAAGGAGATATCTTAAAATATATATAATATATATTTTATAAA 
-TTATTATGTAGGATCCTCAGAGACTACACGTGTTGCACCCATTATATTATGTATAATGGG -TTGAAGATATAGTCCAAATATAATTGAAAGATTATAATAAAATGAACTATTTATTACCAT -TAATAATTGGAGCTACAGATACAGCATTTCCAAGAATTAATAACATTGCTTTTTGAGTAT -TACCTATGGGGTTAGTATGTTTAGTTACATCAACTTTAGTAGAATCAGGTGCTGGTACAG -GGTGAACTGTCTATCCACCATTATCATCTATTCAGGCACATTCAGGACCTAGTGTAGATT -TAGCAATTTTTGCATTACATTTAACATCAATTTCATCATTATTAGGTGCTATTAATTTCA -TTGTAACAACATTAAATATGAGAACAAATGGTATGACAATGCATAAATTACCATTATTTG -TATGATCAATTTTCATTACAGCGTTCTTATTATTATTATCATTACCTGTATTATCTGCTG -GTATTACAATGTTATTATTAGATAGAAACTTCAATACTTCATTCTTTGAAGTATCAGGAG -GTGGTGACCCAATCTTATACGAGCATTTATTTTGATTCTTTGGTCAAACAGTGGCCCTTA -TTATTATATTAATAATATATAATGATATGCATTTTTCTAAATGCTGGAAATTATTAAAAA -AATGAATTACAAATATTATAAGTCTATTATTTAAAGCCTTATTTGTAAAAATATTCATAT -CTTATAATAATCAGCAGGATAAGATAATAAATAATCTTATATTAAAAAAAGATAATATTA -AAAGATCCTCAGAGACTACAAGAAAAATATTAAATAATTCAATAAATAAAAAATTTAATC -AATGATTAGCTGGATTAATTGATGGTGATGGATATTTTGGTATTGTAAGTAAGAAATATG -TATCATTAGAAATTCTAGTAGCATTAGAAGATGAAATAGCTTTAAAAGAAATTCAAAATA -AATTTGGTGGTTCTATTAAATTAAGATCAGGTGTAAAAGCTATTAGATATAGATTACTTA -ATAAAACTGGTATAATTAAATTAATTAATGCAGTTAATGGTAATATTAGAAATACTAAAA -GATTAGTACAATTTAATAAAGTTTGTATTTTATTAGGTATTGATTTTATTTATCCAATTA -AATTAACTAAAGATAATAGTTGATTTGTTGGATTTTTTGATGCTGATGGTACAATTAATT -ATTCATTTAAAAATAATCATCCTCAATTAACAATTTCTGTAACTAATAAATATTTACAAG -ATGTACAAGAATATAAAAATATTTTAGGTGGTAATATTTATTTTGATAAATCACAAAATG -GTTATTATAAATGATCCATTCAATCAAAAGATATAGTATTAAATTTTATTAATGATTATA -TTAAAATAAATCCATCAAGAACACTAAAAATAAATAAATTATATTTAAGTAAAGAATTTT -ATAATTTAAAAGAATTAAAAGCTTATAATAAATCTTCTGATTCAATACAATATAAAGCAT -GATTAAATTTTGAAAATAAATGAAAAAATAAATAAATTATTTAATAAAGATATAGTCCAA -ATTATATATATATAATATATATATATATAACAAGCACCCTGAAGTATATATTTTAATTAT -TCCTGGATTTGGTATTATTTCACATGTAGTATCAACATATTCTAAAAAACCTGTATTTGG -TGAAATTTCAATGGTATATGCTATGGCTTCAATTGGATTATTAGGATTCTTAGTATGATC -ACATCATATGTATATTGTAGGATTAGATGCAGATCTTAGAGCATATTTCCTATCTGCACT -AATGATTATTGCAATTCCAACAGGAATTAAAATTTTCTCATGATTAATAAATCCCTTTAG -CAAGGATAAAAATAAAAATAAAAATAAAAAGTTGATCAGAAATTATCAAAAAATAAATAA 
-TAATAATATAATAAAAACATATTTAAATAATAATAATATAATTATAATAAATATATATAA -AGGTAATTTATATGATATTTATCCAAGATCAAATAGAAATTATATTCAACCAAATAATAT -TAATAAAGAATTAGTAGTATATGGTTATAATTTAGAATCTTGTGTTGGTATACCTCTATA -TACTAATATTGTAAAACATATAGTAGGTATTCCTAATAATATTTTATATATTATAACAGG -TATTTTATTAACAGATGGTTGAATTGATTATCTATCTAAAAAAGATTTAGATAAAAAAAC -AATTATAGAAATTAATTGTAGATTTAGATTAAAACAATCAATAATTCATAGTGAATATTT -AATATATGTATTTATATTATTATCACATTATTGTATAAGTTATCCTAAAATAAAAATTGC -TAAAGTTAAAGGTAAATCATATAATCAATTAGAATTTTATACTAGATCATTACCATGTTT -TACTATTTTAAGATATATATTTTATAATGGTAGAGTAAAAATTGTACCTAATAATTTATA -TGATTTATTAAATTATGAATCTTTAGCTCATATAATTATATGTGATGGTTCATTTGTAAA -AGGTGGAGGTTTATATTTAAATTTACAATCTTTTCTAACTAAAGAATTAATTTTTATTAT -AAATATTTTAAAAATTAAATTTAATTTAAATTGTCTATTACATAAATCTAGAAATAAATA -TCTTATTTATATAAGAGTAGAATCTGTTAAAAGATTATTTCCTATAATTTATAAATATAT -TTTACCTTCTATAAGATATAAATTTGATATTATATTATGACAAAAAAAATATAATATGAT -TAATTAATTAATTAATTAATTAATTTATTTATTATTTACTTTTTTGATATATATAGAGGC -AAACTCGAGGAAAACCATATAATTAGAATAAGTAATAATTATATGACAACCGTCGAACTA -AATCATATTCAAGAAATTAATATGTAAAAGCGTAGAGATTAGACGCCTCTGGTTATCTAA -GTAATATATATATATATATTATATGATAACATAAGGTATAATCCAATGAGATCAGTAATG -ATTTTAAAACAATAATTTTGTTTTAAGTATTAATAATAATATTAATATTCGACCTCTTAA -TTGAGGATATTATAATCATAATTTTTTATATTATAATATAAAATTTAACTAGCTAGATAA -TATTATATAAAAAAAAAAAATAATATTATATAAATTAATTAAAATAATTTTTATTAATTG -AAACTGAAATGTTTTAAAGTTAAATAAAAGAGCTCTAATCCATGGTGGTTCAATTAGATT -AGCACTACCTATGTTATATGCAATTGCATTCTTATTCTTATTCACAATGGGTGGTTTAAC -TGGTGTTGCCTTAGCTAACGCCTCATTAGATGTAGCATTCCACGATATTAATTTAATAAG -TGTCGTGCTTAAAATTCACTAAAATAATATATAATAAATTATAATAAATATATAAAAAAA -ATAAAAAAAATAAAAAAAAATTAATATCTTATGATTAATTTTATATAAATAAAAATTTAT -TAAATATTATTGGTTATATATATATATATATTAATAATAAAAAAATATATATATATATAT -AGCTAACGGGGAAACTCTTATAATTATTATTTATATAATAAATAAGACAATCCCGTGATA -ACTTTAATATATATATATTATATATTAAAGTATTGTAGAGACTAAACGTGAATGATTTTA -ATATTATTTAAATATTAAAATTAAGAGATAGTCCAATCTTATATGTAAATATAAGTTAAT -ACCAAAAAAAAAATAATATTATTTTGACTTATTATATATTAATATTATTAATAATAATTT -TAACTAATAATAAAGTTTTTATAGAAACTTTATATTATTATTTAATATTTAATTTTCAAT 
-TAATATCTCCTTTTGGGGTTCCGGTCCCTGGTCCGGCCCCCGAAACTAAAGATATTAAGA -ATTTATATGAATCAATTATAAATAATTATATTAATATTTTAAATAAATATCTTATTAATA -TTAATAAAGATAATATTAATAAATTAAAATTTTTAGATAATTATACTGAAGAAGAAAAAG -GTTATTATTTATCTGGATTATTTGAAGGAGATGGTAATATTTATACTAGATGTTTTTCAA -TTACTTTTTCTTTAGAAGATGTTTTATTAGCTAATTATTTATGTCTTTATTTTAAAATTG -GTCATATTACAGCTAAATATAATTTTAATAAAGAATTAACAGCTGTTAAATGAAATATTA -TAAAAAAAAAAGAACAAGAAGTATTTATAAATTATATTAATGGTAAATTATTAACATATA -AAAGATATGATCAATATTTTAAATATAATTTTAATAATCGTTTAAATATTAAATTATTAA -AACCTAAAGAATTTGATTTACTATTAAATCCTTGATTAACAGGTTTTAATGATGCTGATG -GTTATTTTTATCTAGGTTTTCAAAAACATAAAAATAGTCAATGATTAAAATTTCATTTAG -AATTATCACAAAAAGATAGTTATATTTTAGTCCGGCCCGCCCCCGCGGGGCGGACCCCAA -AGGAGATATTATTAAAAAATATTTTAAACTTGGTGGTATTTTAAAAAGAGATTATAAATC -TGGTGCTACAGCTTATATTTATAAAGCTCAATCATCAAAAGCTATAAAACCTTTTATTGA -ATATTTTAATAATTATCAACCATTAAGTCTTAGAAGATATAAACAATATTTATTATTAAA -TATTGCTTACTTATTAAAATTAAATAAATTACATATATTACTTAATTCTTTATTAATATT -AAAAGAATTAATATTATTACAAAGTGTTAAAAATATATCTTTAGAAATAAAAAATGAATT -AAATAATAGAGTTAAAATTATTATTAATAAACTTCATTATAACAATATCGAATAATGATA -ATATTAAAGAGTAAAATTCTTAAAGTGTTAATTAAATAATATTCTTTTTTTTTTATGACT -TACTACGTGGTGGGACATTTTCGTGCGGTCTGAAAGTTATCATAAATAATATTTACCATA -TAATAATGGATAAATTATATTTTTATCAATATAAGTCTAATTACAAGTGTATTAAAATGG -TAACATAAATATGCTAAGCTGTAATGACAAAAGTATCCATATTCTTGACAGTTATATTAT -AAAAAAAGATGAAGGAACTTTGACTGATCTAATATGCTCAACGAAAGTGAATCAAATGTT -ATAAAATTACTTACACCACTAATTGAAAACCTGTCTGATATTCAATTATTATTTATTATT -ATATAATTATATAATAATAAATAAAATGGTTGATGTTATGTATTGGAAATGAGCATACGA -TAAATCATATAACCATTAGTAATATAATTTGAGAGCTAAGTTAGATATTTACGTATTTAT -GATAAAACAGAATAAACCCTATAAATTATTATTATTAATAATAAAAAATAATAATAATAC -CAATATATATATTATTTAATTTATTATTATTATATTAATAAAATTTAATATATATTATAA -ATAATTATTGGATTAAGAAATATAATATTTTATAGAAATTTTCTTTATATTTAGAGGGTA -AAAGATTGTATAAAAAGCTAATGCCATATTGTAATGATATGGATAAGAATTATTATTCTA -AAGATGAAAATCTGCTAACTTATACTATAGGTGATATGCCTATCTTTATTTATATATATA -TTATTATTATTAATAATAAAAAAAAAAATTAAAAAAAAGATAGGAGGTTTATATATAACT -GATAAATATTTATTATATTATTTTTTTTTATAATAAATATTAAAAGATATTGCGTGAGCC 
-GTATGCGATGAAAGTCGCACGTACGGTTCTTACCGGGGGAAAACTTGTAAAGGTCTACCT -ATCGGGATACTATGTATTATCAATGGGTGCTATTTTCTCTTTATTTGCAGGATACTATTA -TTGAAGTCCTCAAATTTTAGGTTTAAACTATAATGAAAAATTAGCTCAAATTCAATTCTG -ATTAATTTTCATTGGGGCTAATGTTATTTTCTTCCCAATGCATTTTTTAGGTATTAATGG -TATGCCTAGAAGAATTCCTGATTATCCTGATGCTTTCGCAGGATGAAATTATGTCGCTTC -TATTGGTTCATTCATTGCACTATTATCATTATTCTTATTTATCTATATTTTATATGATCA -ATTAGTTAATGGATTAAACAATAAAGTTAATAATAAATCAGTTATTTATAATAAAGCACC -TGATTTTGTAGAATCTAATCTTATCTTTAATTTAAATACAGTTAAATCTTCATCTATCGA -ATTCTTATTAACTTCTCCACCAGCTGTACACTCATTTAATACACCAGCTGTACAATCTTA -AGTTATAAAATTTAATTATTTACTTAATAATTAAAAAGTAAATATTATATCTAAACTTAA -TAATATAATAATAATATTCTTATAAAAATATATAAAAAAAAATATATAAAATTTATTAAA -ATATCTCCTTTCGGGAACTATAATATATTTATATAAATAAATACTAATATAATCCTATTA -TATATATATATATATAAAATAATATATATATATAATTAATATAAATAATATTTATAATAA -TTTTTTAATAATATATATAATTTAATATATTAATGAATATTATATAATTATTAAATATAT -TATAATATTATTATTATTTTATAATAAAAATATTTTTAATACTAATTATTATTTATTATT -TATAAATATATAAATAGTATGTTTAATATTATTAATACTAAAAAAAATATAATTATAATT -AGGATCTAACAATACATTTATCTGATTAATATTAATATTAATATTAATATTTATATTAAT -AAACGGATTAAATTAATTGTATCCAATTTAATTAAATTATAGATATATTATTTATAATAT -TAATATATTGTTTTATTAAAAAGGTAAAAATAGTTTTTATTTTATATATAAATATAGGAT -ATAAATAAATATATTATAGTGAACCCCGAAAGGAGAATATATTAAGAATATATTTATATT -TTACATATAATTATTTATAATATAAATATCTCCGCAAAGCCGGATTAATGTAATTATTTA -ATAATTTTATTTAATAATTTATTAAAATAAATATTTACATTTGATAATATTTATATTATG -TCAGTTATTTTATATTAATGTTTAATCTATTATAATATTTTTTTTTATAAATATATTATT -TATTTATATTAATTATATATATATATTATTTTTATAATATATATATATTTTTATTAAATA -TTTATTAAATATTTATTAAATTATTATAATGTTGTTATTAATCTTATTAAAAAATATATA -TAAAAATGCCACAATTAGTTCCATTTTATTTTATGAATCAATTAACATATGGTTTCTTAT -TAATGATTCTATTATTAATTTTATTCTCACAATTCTTTTTACCTATGATCTTAAGATTAT -ATGTATCTAGATTATTTATTTCTAAATTATAATATATATTATTAATTTATTTATTCATAT -AAATATTATTATTATATATAAATATTAATAATATTTATACTTATTTAATAATAATAAAAT -AAAAAATAATTATAATTTAATATATTTAATATATTTCCTTACGGACTATATATTTATATA -TATATATTAAATACAATTTAATTTAATTTAATTATGTTATTTATTAAATAAAGTTATATT -ATGATATAATAACAATATTATATATTATTATATAATTATAATATATTTTAATATAATTAT 
-CAAAAGAAATAATAAAAAAATATTAATAAGAATATAATTTAATAATTATTAAAAAAAAAT -TCTTATAGTCCGGCCCGCCCCCCCCGCGGGGCGGACCCCAAAGGAGGAGTAATAAAAATT -ATTAAATACAAATATTATATATATATAATTCATTATATATATATATATATAATAATTAAT -CTTATTTTTTTATATATTTATTTATATATCTATTTATATTTTATATATATTTATTTATAT -ATCTAAGGGGTTCGGTCCCTCCCCCCGTAAGTATAATATACGGGGGTGGGTCCCTCACTA -TTTATATTTTTATTTTATATATTTTATATATTTATAAATAAAGTATAATAAGATATAATT -ATGATTAATTATTTATAAGTTATAGTTTTATAAATTTATAATTATTATGTTTAATTTATT -AAATACATATATTACATCACCATTAGATCAATTTGAGATTAGACTATTATTTGGTTTACA -ATCATCATTTATTGATTTAAGTTGTTTAAATTTAACAACATTTTCATTATATACTATTAT -TGTATTATTAGTTATTACAAGTTTATATCTATTAACTAATAATAATAATAAAATTATTGG -TTCAAGATGATTAATTTCACAAGAAGCTATTTATGATACTATTATAAATATGCTTAAAGG -ACAAATTGGAGGTAAAAATTGAGGTTTATATTTCCCTATGATCTTTACATTATTTATGTT -TATTTTTATTGCTAATTTAATTAGTATGATTCCATACTCATTTGCATTATCAGCTCATTT -AGTATTTATTATCTCTTTAAGTATTGTTATTTGATTAGGTAATACTATTTTAGGTTTATA -TAAACATGGTTGAGTATTCTTCTCATTATTCGTACCTGCTGGTACACCATTACCATTAGT -ACCTTTATTAGTTATTATTGAAACTTTATCTTATTTCGCTAGAGCTATTTCATTAGGTTT -AAGATTAGGTTCTAATATCTTAGCTGGTCATTTATTAATGGTTATTTTAGCTGGTTTACT -ATTTAATTTTATGTTAATTAATTTATTTACTTTAGTATTCGGTTTTGTACCTTTAGCTAT -GATCTTAGCCATTATGATGTTAGAATTCGCTATTGGTATCATTCAGGGATATGTCTGGGC -TATTTTAACAGCATCATATTTAAAAGATGCAGTATACTTACATTAAATTATAAAATAAAA -TTATAAAATAAAATAATTTACATATGGAGTATTAAACTATAATAAATACAATATACCCCA -TCCCCCCCTTTTAATAATATTCTTTTATCTAATAAAATATTTATTTATTAATATTATTAT -TATCTTCTTCAAGGACTTATTTAATATATTTAATAACTTATTATACTTATTTATATTTAT -AATTAATACAAATATATTATTAATCTTACTCCTTCGGAGTTCGGCCCCCCATAAGGGGGG -GACCTCACTCCTTCCCCACTGCACTGGATGCGGGGACTTATTTTTATTATTATTATTTAA -TCTTTATTTATAAAATTATATATTATATATAAATTATTATACTTAATAATTAAAAAAAAA -CCTCTAATTATTATTAATATTATATATAATATATATATTCTCATTAATGTTATATATAAT -ATATATATTCTCATTAATATATTAATATAGTATTAAAAAAAATAAAATATTTAATAAATA -TTATTATTAATAATATTTATTAAAAATAATATAACATAATAAATATAAGATTATTATATA -ATATATTTATTATATCATATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGG -AGAAATTATAACATATTTTTTAATAATATTCATATTTATTTTATATACAAATAAATATAT -TTATTTAGAATAATAAAAAAAAATAATAAATAAATATATTATTATCATTATTATACTTTA 
-TTCATTATTTATTATAATAATTATATATAACAATTATAATATATAATTATATTTTATATA -ATATTATATTAATATTTAATATATTTATTATTATTATTACTTCTATGGAAACTTTATATT -TTAGATATTTTTATTATTATTATTAATTTATAATGTTATATTTTTGATTTATAAATATAT -AAGTCCCGGTTTCTTACGAAACCGGGACCTCGGAGACGTAATAGGGGGAGGGGGTGGGTG -ATAATAACCAGAATATTCAATAAATACAGAGCACACATTAGATAAATTTTATAATATAAC -CAATATAAAATAAAATTAAAATAATTAATATATATATATAAATATAATAAATTATTATAT -ATAAATATATATAATTTTTATAATAAATATTATAATATTATATAAATAAATAATTATAAT -ATATAATAAATATATAATAATAATAAAAATATTAACAATATAATAAAAATTTATAATATA -AATATAAATTATAAATAAGTTAAATTAATAAAATAATAAATGATTAACAAGAAGATATCT -GGGGTCCCATTAATAATTATTATTTTCAATAATAATTGGGACCCCCCACCATTATAATAT -CATATTAATTAATATAATAATAATGTATATAAAATAGAAATAATAATTAATATAATAATA -ATAATATATATAAAATAGAAATAATAATTAAATATATATATAAATAATTATTTATATAAT -ATATTATAAATAATAATAATAATAAATATTTATTAATTAATAATGATTATAAATATTTTA -TTTAATATAAATTTATAACTATTTTATTATATATATATTTTTTATTCATAAAAATTCCTT -TTGAGGATTTTTATTTTATATAAATATCTTCTAATATTTATAATAAATAATAATATATTC -ATTATATTTATAATTATATATAATGTAATACGGGTAAACATTACCCGTTGTTCACGGGTA -ATGTTTACCCTATTTTATATAATTCTTAATAAATATATTTATATTTTTATATAAAAAAAA -TTATAATAATTTATTAATTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGAACTCCGGAA -CTATAAAAATAATTTTAATATAATTTATATATTTTATGATTAATATAATATATTATTAAT -GTAACTCCTTCGGGATTTGGTCCCCCTCGTAAGTATATAGTATATAGTATATAGTATACG -GGGGGTCCCTCACTCCTTCGGGGTTCGGTCCTCCCTTACGGGTACGGATACGGATACGAA -TATGGGGAGTCCCTCACTCCTTATCACTACGCTGAAGGTGGAATTTATTTTATATTATTA -TTAAATCTTTATTTATTTAATTATATATTTAATATATATATTATTATAATAAAACACCTA -ATTATTATTAATGTTATATTTAATATAATATATATATTCTTAAAAATTTATATAATATAA -ATAAATAAAAAAAAAAGAAAGTACATAATTAATATTATTATAAATAATATTATTAAAAAG -AATATAATATAATTAATAGAAAGACGTTTTAAAAATAAAAATAAAAATAAAAATAAAAAT -AAAAATAAAAATAAAAATAAAAATAAAAGAGTTTTGGTTTACATATCAAGACCCAATTCA -ATTGAAACTATTTATTTATTAATCTCCTCCCCTCCCCCTCACTATTATTATAAGTACAAT -TAGGGCGCCAACCCCGCAGTGTTATTTACTGGGAAATGTTTATCCCAATTAATATAATAA -CGAGAGTTATTAATTATTATTTATAAATTCATATAATGTAATATAATGTAATGTAATTAA -TAGAACATTATTGTGTTATTCACCAGTGTTAAGATATATTAATCCCAATTTTATTTAATA -GTGAAGATTATATTTTATTAATTATGAATCCATATTATTATTATTTAATATATTTATAAT 
-ATTATATATAATTATAATTATAAATAATTTATATAAAAAAAGTTTTATTAAAAAATATTA -TTAAAAATATAATATTAATAATAAATAAAAATAATATTATACTCTTAATAGAATTTATAA -TGATAAAAATTAAGATGAAGACTTTTTTTTATAATTATTATAAATTTATATAAAAATAAT -ATATATATATTTATATTTATTTTATTAATATATATAATATATTTATGTATATTAAAAAGA -TATATTTAAATATTTTTATTTTTTTTTTATAAGATAATTTTTGTAAATATATAAGTAATA -AATTAAGTTTTATAGGGGGAGGGGGTGGGTGATTAGAAACTTAACTGAATAATATATATA -AAGCATACATTAGTTAATATTTAATAATATAATCAATATATAATAATTATAAAATAATTA -ATTATATAATAATAATAATGTATAAACAATATAATAAATTGTATAAAATAAAATATAAAT -CATAAATAAAGCTAAATTAATAAAATAATAAATGATAAACAAGAAGATATCCGGGTCCCA -ATAATAATTATTATTGAAAATAATAATTGGGACCCCATATAGAATATAAATAATTAAATA -TATATATATAAATAATAATTTATATAATATATTATAAATAAATAATAATAAATATTATTA -ATCTATAATAATTATAAATATTTTATTAATATAAATTTAATAATTATATATATTTTTATA -ATAACTCCGAAAGAGTAAGGAGATATTAATTTCTTATAAAAATTTATTAATAATAATAAT -ATATAAAATATATAAATAATATATTATATATAAAATAAAATAAAATAAATAATATATTAA -AAATATTGAAAGTATTTTAATAAATAATAAATTTAAAATTCATATTTATAATAATAAATA -AATAAATAAATAAATAAGTAAATATTTAGATTCTCATTAATATTAATATTTATATTTCTT -TTTTTTTATAATAATAAAAATATCATATATAAATATAATATAATATAATATAATAAATTA -TTATATATAAATAATAAATATTAAATATAATATATAATAATATATAATCTTACAATTTAT -AATTTAATAAAGAAGGAAATAAATAATAATAACTCCTTTTGGGGTTCCGGTGGGGTTCAC -ACCTTTATAAATAATAAATAAAGATGTTTACTCCTCTTCGGGGTTCCGGTCCCCTTTTTG -GGTTCCGGAACTAATTAATATTTTATATAATAATAATAATATATTAATATAATTTCATTA -TTAATAAATATCTCCTGCGGGGTTCGGTTCCCCCCCGTAAGGGGGGGGTCCCTCACTCCT -TCGGAGCGTACTATTATTATAAATAATTATATATTATAATATAATTAAAAAGTATTATAA -TTGAAACGAAAATTGTAATTTTAAATGGAATAATAATTATTATATATTTAATATATTTAA -TAAAGTTATAATATCTCTTTCTACCGGACTATTTTATTTTATTTTATTTTATTTTTATAA -AGAAAAATAGTAATAATATTATCTTCTCCTCCTTTCGGGGTTCCGGTTCCCGTGCCGGGC -CCCGGAACTATTAATTATATAATATAATATAATATAATATAATATAATATGATACGGATC -AAACATTACCCGTTGTTCACTGGCAATGTTTAATCCTATTGTATATAAATATAATAAAAT -AATTATCCCTCTCGTAATACATATATAAAATATAAAATATAAAATAAAAATATTATGATT -ATTATAATATATATATATATATATATAAATATATATATATAATTTATAATTTATATGATT -AATATATTATATATATAAAAAATATATTAAATTTACTTTTTATAGAAAGGAGTGAGGGAC -CCCCCCCCCTTACGGGGGGGAACCGAACCCCGCAGGAGATATTTATTTTAATACTTATAT 
-AGTATTTATTAATAATATAATAATTGTTATTATAAATATTAATAATAATATAAAAATAGG -GTAAATAATATAAATAATATGAATAAATATAAAAACATATTAAATATAAAATATATCATA -AATTTAATAAATATTATAATAATTTATAAATGATAGATATCTGGGGTCCTATAAATAATA -ATTATTTTCAATAATTATAGGGACCCCCACCTATTATATAAATATAAATATAAATATAAA -TATAAATACAAATATAAATATATAAATATATAAATATAATATAAATACAAATATAATATA -TAAATATAAATATAAATATATAAATATAAGTCCCCGCCCCGGCGGGGACCCCGAAGGAGT -GAGGGACCCCTCCCTATACTAATGGGAGGGGGACCGAACCCCGAAGGAGTATAAATAAAA -ATTAATAATATATATATAATTATAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCC -GAAAGGAGAAATAATAATATAATATATAATAAAATATAACTTATTAATATAATATTAAAA -ATATAATTAACAAGAATAAATAGTCCGTGGGATCGAACCCCCTTTTTTATTTAATATTTA -ATATTTAAAGAAGGAATTGTTTATATATATTAATATCTTATTTGGGGATTAATATAATAT -ATAAGTTTTGGATACCAGGCCAAAGACCGGAATCCCAAAAGGAGATTATATAAATATTAT -TTATCTCCCTTTTTTAATATTATAATAATTTTATTAAAAATAAAATAATAATAATAATTA -TAATTTATAATAACAATTATAATAATTTAATTAATTAATTAATTAATTAATTAATTAATT -AATTAATTAATAATAAATATAAATATAAAAAGAATATAATTTATAATAAATAAATTTATA -TATATATATATATATTAAATAAAATATTTACTTCATTAATATAAAATATAAATATATTTA -ATTAATAAGTATATATATATAATAATATATAATAACCTATTTATATATATAATCTTAATA -TAATTATAAGAAATATTATATAAGTAATATATAAAAATAATATAAAATAATTATAATTCA -ATTTATATATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGGAAT -AAGATAAATATATAAATTATATTAATAAATATAAATTTTAAATGAATTAATAAAATTAAT -ATATATATGTATATATATATATATATTAAAAATATTTAATTATTTTTAGGAAGGAGTGAT -AGATCCCTTTGGGGGACCGAACCCCTATTTAAGAAGGAGTGCGGGACCCCGTGGGAACCG -AACCCCTTTTTTATTTAAAGAAGAAGTTTTATTTTATTTTATTTTATTTTATTTTATTTT -ATTTTATTTTATTTTATTTTATTTTATTTAATTTAATTTTAATTAGGTTAATAAATAGTA -ATAATAAACTTAATAATAATAATAATAATTTTATTTTTATAATTTATTAATAATAATAAT -AATTATATATATATATATTATTAATAAATATAGACCTTATCGTCTAATGGTTACGACATC -ACCTCTTCATGTTGATAATATCGGTTCGATTCCGATTAAGGTTATTCATAATAATAAATA -TTTGTAAAAAAAGTATATATAATTAAACATATTCTTTATATTAATTAATAATTATTAATA -ATATACATTTTATATAATACAATTATATATATATATATATTTTTTTTTAATACAAATAAT -ATATTCATAATAATAAATACCGATTGTTATTATACTATAATAAAATATATAATATATTTT -TCATTATAATATTTTTAAATAAATATTATAATAAATTATATAAATAATATTTATGTATAA -TAATAATAATAATAATTGTTATTAATTAATTCTATAATTATTATATATTTAATTTTTTTT 
-TTTAATATAATATATAATAATATAATTTATTTTATTTTTTTTTATAGTTCCGGGGCCCGG -TCACGGGAGCCGGAACCCCGAAAGGAGAATATAAATTAATAATAATATAAATAACATATT -AACAATAAATTATTGTTAATATAATAATAATAATAACAATATTAATAAATAATATAAAAA -TTATTAATATTATATTTATATAATATTAATATAAAAATCTTTCATAATATTAATTATTAT -TAAATAATAATGATATCATTAATATTAATATAATCGTCAATATTATTTATTTATTTATTT -ATTTATTTATTTATTTATTTATTTATTATTAAATAAATATTTTTAAATATTATATTATAT -TATTAACTTTTTATTAAAAAAATTAATAATGATATAATATAATTAATATTATCCACGGGA -CCAATGACCAACCCAGTAGTTGACCGGATTGGCGCCCGCGAGGTTTATATTTAATAAATA -ATAATAATAATATTAATAAAATCTATTAACTTTTTTTTTTAATGGATTATATTAATGAAA -AAAAAATGAGAAATATCTTTTTTTTTTAATAATTATAATTTATATATAATAAAATATGTA -TATATAATAAAAAAATAGTTTTTAATATTATAATATAATTATATATATAATTATAAATAT -ATATATATATAATAAGTATTAATTAATAATATATATTTATATATTTTTTATTAATTAATA -TATATAAAATATTAGTAATAAATAATATTATTAATATTTTATAAATAAATAATAATAATA -TGGCATTTAGAAAATCAAATGTGTATTTAAGTTTAGTGAATAGTTATATTATTGATTCAC -CACAACCATCATCAATTAATTATTGATGAAATATGGGTTCATTATTAGGTTTATGTTTAG -TTATTCAAATTGTAACAGGTATTTTTATGGCTATGCATTATTCATCTAATATTGAATTAG -CTTTTTCATCTGTTGAACATATTATAAGAGATGTGCATAATGGTTATATTTTAAGATATT -TACATGCAAATGGTGCATCATTCTTTTTTATGGTAATGTTTATGCATATGGCTAAAGGTT -TATATTATGGTTCATATAGATCACCAAGAGTACTATTATGAAATGTAGGTGTTATTATTT -TCATTTTAACTATTGCTACAGCTTTTTTAGGTTATTGTTGTGTTTATGGACAGAGTGAGA -CAAGTATAAGTATATTATTATAATATCATACCATTAAATAAATTATTTTAATGAAATGAT -TATGTTTATATATAACATATACCTAATTAGACATGCATTATTAGTAATAATTTTGTATGA -AACTCTAATAATAATAATTATTATTAATTATTAAGGTAAGATTCATATGGATAGCGTAAG -TCAATCTAATATTATAAAATATCGTAACATAAACAATATTTTTTTCTATTATTAATTAAT -AAATAATAATAAATAAAAATAATTATATGAGAAGTAAGATATTCAATTCTGTCTAGAATA -CATATATATACGTTAATACTCATCGGTATAAAATTAGAATCCTAAGTGAATTATTGAAAG -TATAATAATATAAACTTGGTAAGCCCAATTATTTCCATATAATATTAATATAAATATTAT -ATGGTAGTTATATATAATATTATTAAATAAATAATAATAGAAATTATAATATAGATAAGT -GGGTAAAAGACTATTGAAAAAGCTAAAGATTATATGTAATGTATAATATAGATCAAATTA -TTTATATATTTTAATAAAAATATATTAATAATGGTTAATATTATTATTAATTAATTAATT -AATTAATTAATAATAATAACGAATAAATGATTAATGTGAAAGCATGCTAACTTCAATATA -GGATGATTTATATAGTATATAAATTGTTTGAGCTGTATACTATGAAAGTAGTACGTACAG 
-TTCTGAGTGGGGGAAAATTTGTAAAGATCTACCTATCACAATTGTCACATTGAGGTAATA -TAAATATCGCCTCAAATATATTTAATATAATAAAACTAATTTATATAATAATGTTAATAT -TATTAATTTATATTTTTTATACGATTATAATAAGACAAATAATAAAAACTAAAGAATATC -TTATATTAATTAAGAGTATAGATTATATTAATAAAAATAAATATATAATTAATTTAAATA -TAACAAATAAGAAAGATATAAATAATAATATTGGTCCATTAAATATAAACATTTTATCAA -TTATTTATGGTTCAATATTAGGAGATGGTCATGCTGAAAAAAGAAAAGGTGGTAAAGGAA -CAAGAATTGTATTTCAACAAGAATATTGTAATATTAATTATTTATATTATTTACATAGTT -TATTAGCTAATTTAGGTTATTGTAATACTAATTTACCTTTAATTAAAACTAGATTAGGTA -AAAAAGGTAAAATTAGACAATATTTAAAATTTAATACATGAACTTATGATTCATTTAATA -TGATTTATTCAGAATGGTATATTAAAAATATATCTGGAAAAGGTAATATTAAAGTTATTC -CTAAATCTTTAGACAATTATTTAACTCCTTTAGCTTTAGCTATTTGAATTATAGATGATG -GATGTAAATTAGGTAAAGGTTTAAAATTCACAACTAATTGTTTTAGTTATAAAGATGTTC -AATATTTACTTTATTTATTACATAATAAATATAATATTAAATCTACTATTCTTAAAGGCA -ATAAAGAAAATACACAATTTGTTATTTATGTATGAAAAGAATCTATACCTATTTTAACTA -AAATTGTATCTCCTTATATTATTCCTAGTATAAAATATAAATTAGGTAATTATTTATAAT -AAAATATATAGTATTATATTAATTATTATATTATTATAATGCGATATTATTGAAAACATG -TCAAAATTATATTATTAAGTAACAAGACAGTGGGTTATATAATTATATGATCCCAACAGA -ATACACCAATAATAGGTATTATTATAAAAAAAATAATAATATTTAATGTTTATTCGAAGA -AAATTTATAATATTATTATTATAACACAAGGTTTAATAATCTATATATATATATTATATA -TATAACTACTGTTATTATTCCATTTACCTAATTAATATATAAATAATGAATTATAATTAT -TATGATTAATATTTTTATAATAATAACCCCATCATAACATTTATATATAACATTTATATA -TAACATTTATATATAATATTTATATTATGGTATTATTAGGTATAAATATTTATTCATAAG -AGAAAATAGTGATTAAATGGAATTATAAAAAGGGTAGATATTATTAAATACAGGGTATTA -TTTATATTAATAAATCAATAAATATTGAGATTATTATTATTAAAAAATAATAATAATTTA -TAAATAATATTATTTTCTTGGCACTAGTTATTACTAATTTATTCTCAGCAATTCCATTTG -TAGGTAACGATATTGTATCTTGATTATGAGGTGGGTTTAATATAGAGGATCCATATTATA -GTAATATAATATTAAATAAATCTGTTTTATGCTGAAATATCTTCATTTGAATAATAAATT -ACTATATTATTCAATTAATTATTTATAATAATATAATTTGAAATAAAAATAATATAGTTA -AAATATTTATTATAAGAAGAAAATTAGCAGTAATTAATATATATATATATATAAAATTAA -TTATTCAGAGACTTTATAGTTATTATATAAATAATACTATTATTTATGATAAAAATCATA -AATTAAACACAGATAATCCTATTTATGCATATATTGGTGGTTTATTTGAAGGAGATGGTT -GAATTACTATTTCAAAAAAAGGTAAATATTTATTATATGAATTAGGTATTGAAATACATA 
-TTAGAGATATTCAATTATTATATAAAATTAAAAATATTTTAGGTATTGGTAAAGTAACAA -TTAAAAAATTAAAAATAAAAGATGGTACTATTAAAGAAATATGTAAATTTAATGTAAGAA -ATAAAAATCATTTAAAGAATATTATTATTCCTATTTTTGATAAATATCCTATATTAACTA -ATAAACATTATGATTATTTATATTTTAAAGATAATTTATTAAAAGATATTAAATATTATA -ATGATTTATCTTATTATTTACGTCCTATTAAACCATTTAATACTCTTGAAGATATTTTAA -ATAAAAATTATTTTTCTTCATGATTAATTGGTTTTTTTGAAGCTGAAAGTTGTTTTAGTA -TTTATAAACCTATAAATAAAAAAATAAAACTTGCTAGTTTTGAAGTATCTCAAAATAATA -GTATAGAAGTTATATTAGCTATTAAATCATATTTAAAAATTACTCAAAATATTTATACAG -ATAAATTTAATAATTCAAGAATAACACTTAAAAGTATTAATGGTATTAAAAATGTTGTAA -TATTTATTAATAATAACCCTATTAAATTATTAGGTTATAAAAAATTACAATATTTATTAT -TCTTAAAAGATTTACGTCTTATTCTTAAATATAATAATTATTTTAAAATTCCTCCTAAAT -ATTAATCTTATATAAAAATATAATAATAATATATTTATATATTATATAATTATATAAACA -AAATATAATTTATATATAATTATTTATTATAAATATAGTCCGGCCCGCCCCGCGGGGCGG -ACCCCGGAGGAGTGAGGGACCCCTCCCTATTCTAACGGGAGGGGGACCGAACCCCGAAGG -AGTTTAATTATATATTAAATATATTATTATCAATAAATAATTCCTTTGAACTATTTATTA -TTTTATTATATTTATTTTCTCCTTCATTATTAATTTTTATTAATAATTAAAATCTTATCA -TTTTATGGTATTTTTATTTCTATTTTAGGATATCGAAACTATAAATTAAAAAGTATAATT -TTATTAATTATAATTTATGATTAATAAATAAGAAATAAAAACTTTAGAAGTAATATTTAT -CTTTTTTTTTTATAAATAAATATTATGATTAATATATAATCATTTATAAATATTTATATA -TAATTATATATATACATAAATAGGATTAAGATATAGTCCGAACAATATAGTGATATATTG -ATAATAGTTTTCAAATATGTAACTATTTAAACATTAAAAGCTCAGTATCTAACCCTCTAA -TCCAGAGATTCTTTGCGTTACATTATTTAGTACCTTTTATCATTGCTGCAATGGTTATTA -TGCATTTAATGGCATTACATATTCATGGTTCATCTAATCCATTAGGTATTACAGGTAATT -TAGATAGAATTCCAATGCATTCATACTTTATTTTTAAAGATTTAGTAACTGTTTTCTTAT -TTATGTTAATTTTAGCATTATTTGTATTCTATTCACCTAATACTTTAGGTCAAAATATGG -CCTTATTATTAATTACATATGTAATTAATATTTTATGTGCTGTATGCTGGAAATCTTTAT -TTATTAAATATCAATGAAAAATTTATAATAAAACTCTATATTATTTTATTATTCAAAATA -TTTTAAATACAAAACAATTAAATAATTTCGTATTAAAATTTAATTGAACAAAGCAATATA -ATAAAATAAATATTGTAAGTGATTTATTTAATCCCAATAGAGTAAAATATTATTATAAAG -AAGATAATCAGCAGGTAACCAATATAAATTCTTCTAATACTCACTTAACGAGTAATAAAA -AGAATTTATTAGTAGATACTTCAGAGACTACACGCACACTAAAAAATAAATTTAATTATT -TATTAAATATTTTTAATATAAAAAAAATAAATCAAATTATTCTTAAAAGACATTATAGTA 
-TTTATAAAGATAGTAATATTAGATTTAACCAATGATTGGCCGGTTTAATTGACGGAGATG -GTTATTTTTGTATTACTAAAAATAAATATGCATCTTGTGAAATTCTTGTAGAATTAAAAG -ATGAAAAAATGTTAAGACAAATCCAAGATAAATTTGGTGGTTCTGTAAAATTAAGATCAG -GTGTTAAGGCTATTAGATATAGATTACAAAATAAAGAAGGTATAATTAAATTAATTAATG -CCGTTAATGGTAATATTCGTAATAGTAAAAGATTAGTACAATTTAATAAAGTATGTATTT -TATTAAATATCGATTTTAAAGAACCTATTAAATTAACTAAAGATAATGCTTGATTTATAG -GGTTCTTTGATGCTGATGGTACTATTAATTATTATTATTCCGGTAAATTAAAAATTAGAC -CTCAATTAACTATTAGCGTTACAAATAAATATTTACATGATGTTGAATACTATAGAGAAG -TATTTGGTGGTAATATTTATTTTGATAAAGCTAAAAATGGTTATTTTAAATGATCTATTA -ATAATAAAGAATTACATAATATTTTTTATCTTTATAATAAAAGTTGTCCTTCTAAATCTA -ATAAAGGTAAACGTTTATTTTTAATTGATAAATTTTATTATTTATATGATTTATTAGCTT -TTAAAGCACCTCATAATACTGCTTTATATAAAGCTTGATTAAAATTTAATGAAAAATGAA -ATAATAATTAAATTTTCTCCGTATTCATTATTATATTATCTAATTTATAAAATATTTAAA -GATTCCTTATAATAATATAACATCTTTGTAAATTATTGTTAAAGATAATATAAATTATTA -TGAATCGGTAGATTATATTTTTACAATCTTATTAAATAAAATTCTGATCATTAAACATGA -TTGAAGAAATAATAATAGTTTATGAAATAAGATAGTGTAATATAAATTTTTATGAAGATA -TAGTCCATTTTATATTTATTATAAAAGCATCCTGATAACTATATTCCTGGTAATCCTTTA -GTAACACCAGCATCTATTGATATTAAAAATATTAATAAAATTATTATTATTTAATCTTAT -TTATTTTATATAAAAAAAATAAATAATAATTATTAATAAAAATATATTATTTATTTCTCC -TTTCGGGGTTATTTATATATATTCCTTTATAATTTATATTTAATATATTATATTAAATAT -ATGAAAAATTATAATAAATAAATTAATTAATTAATAATAAATAATAATAAAAAGTACAGT -AGCATTAAATATTCTTAAGTTTCCGCTTTGTGGGAACTCCCATAAGGAGTTTAATGATTA -AAATTGGTTAATTGTCAAGAAAATCTAAGGTATTAATAAATAAATAATACTATGACAACT -TGCAGCGAAGTTTATATCATCTCTATATTATATATTAATATATATATATAATAATAATAA -TAATATTAATATAATATAAGATATAAAAACGTTCAACGACTAGAAAGTGAACTGAGATAG -TAATACCTTTCCACGAAAACCAATTAATTTATAAATTATTTTTAAATAAAGAATAGATTA -TTAATTTTTTTTATATAGTTCCGGGCCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGTA -ATATATATTATATATAAAATAAAAAATATATATATATATATTATAAAATATCAAAAGTTT -TAATCTTTTATTATAAATTAATGACATAGTCTGAACAATAATGAAAATTATTGAGATAAG -ATATTAAATAATCTTATGTTAACATATATAAATTGTGTACCTGAATGATACTTATTACCA -TTCTATGCTATTTTAAGATCTATTCCTGATAAATTATTAGGAGTTATTCTAATGTTTGCA -GCTATTTTAGTATTATTAGTTTTACCATTTACTGATAGAAGTGTAGTAAGAGGTAATACT 
-TTTAAAGTATTATCTAAATTCTTCTTCTTTATCTTTGTATTCAATTTCGTATTATTAGGA -CAAATTGGAGCATGCCATGTAGAAGTACCTTATGTCTTAATGGGACAAATCGCTACATTT -ATCTACTTCGCTTATTTCTTAATTATTGTACCTGTTATCTCTACTATTGAAAATGTTTTA -TTCTATATCGGTAGAGTTAATAAATAATATATAATTAAATTAATACATAGATATAATATA -TATATTATTATTATTAATAATATAATAAAAATAAAAATAAAATTATTAATAATAATAATA -CTTTAATAATATTCTTAAAAATAATATATCTCTAATTTATAAAAATTAAATAATAATAAT -AAAAAAAAAATATTATAAAATATAAATTAATTAATAATGAAAATAATATACTTATTAAAT -TAATATAAATAAATGAATAATATAATATAACTATATTGAATTATAATCTATCTATCTTTT -TTTTTCATATAATTATAATATATATATTAATATATATAATTATTATTTTATATATTATAG -TTCCGGGGCCCGGTCACGGAAGCCGGAACCCCGCAAGGAGATTTATTAATTATTATTATC -ATTATTATTTTTTATTTAATCTTATTTATTATAAAATAATTAATTATCATAAAGCATAAT -TATTATAGAATCTTATTATTTTCTTTATTTAAATTTATAAAAATATAAAGTCCCCGCCCC -CTTTTTATTTTATTTAATTAAGAAGGTATTTTAAAAAAGGAGTGAGGGACCCCCTCCCGT -TAGGGAGGGGGACCGAACCCCGAAGGAGTACTCATTTAATATAAATATTAAATAAAAATT -ATTTTATATATATTAATGATTATTAATATTGATAATATAAATTATTTTATAATTAATTAT -TATAAATATATAACTATTAATAATTAATTTTTAATCTAGGGGTTTCCCCCACTTACATAA -ACTTACGTATACTTACATATACTTATGTATACTTACATATACTTACGTATACTTATATAT -ACTTATGTATACTTACGTATACTTACATATATGGGGGATCCCTCACTCCTCCGGCGTCCT -ACTCACCCTATTTATTAATCATTAATAAGAAATTATTATTAAAAAAATTATAATTTACTC -AAAGTTAATTATAAATATATTTTTAAATATCTATTTTATTAATCTTTTATAAAATTTAAA -TTAATTGTAATTAATTAATATTATAATAATTATTCTTAGGAAGGATATTTATTTATTTTA -ATTATGAATTCCTGACATAGAGACAATTAATTAGAACTTCTTATTATTATTATAGTAATA -ATAAAAATATTCTAAATATATTATATATATTATTATTTTTTTTATTATTAATAAAATATT -ATAATAAATTTAAATAAGTTTATAATTTTTGATAAGTATTGTTATATTTTTTATTTCCAA -ATATATAAGTCCCGGTTTCTTACGAAACCGGGACCTCGGAGACGTAATAGGGGGAGGGGG -TGGGTGATAAGAACCAAACTATTCAATAAATATAGAGCACACATTAGTTAATATTTAATA -ATATAACTAATATATAATAATTATAAAATAATTAATTATATAATATAATATAAAGTCCCC -GCCCCGGCGGGGACCCCAAAGGAGTATTAACAATATAATATATTGTATAAAATAAATTAT -AAATATTAAATAAAAACCAAATAAATAATATAATAAATGATAAACAAGAAGATATCCGGG -TCCCAATAATAATTATTATTGAAAATAATAATTGGGACCCCCATCTAAAATATATATATA -ACTAATAATATATTATATATATTAATATATAATAATATTATTAAAATATAATATTATTAA -AAAAAAAGTATATATAAAATAAGATATATATATATAAATATATATATTCTTAATAAATAT 
-TATATATAATAATAATAAATTATTTCATAATAAATTATTTCTTTTTATTAATAAAAATTA -CTTATCTCCTTCGACCGGACTATTAAATATTAAATATTTAATATTTAATATTTAATATTT -TATTCTATAGATATTCATATGAAAAATAATAAGTATATAATTATGATAATGAATATATTT -TTATTTATAATTTATTATTATAAAAATATTTTAATTTAATAATAATAATAAATCATTATA -TTAATTCTTTTAAGAATTTATAATTGTCATTATTTATTATATACTCCTTATTAAAAGGGA -TTCGGTTTCCCTCATCCTCATGGGTATCCCTCACTCCTTCTGATAATTAATTTTATAATA -ATAATAAAATAAACTTAATTAAATATTATATATTTATTTACAATTATATATATATATTAC -TCATAATTAAATTAAATTAAGATGCAATTCAATACGGTTGTATTATATTATTCATCAAAT -ATTGTTAATATTGATACCTACAGAGATATTTAATATTTTTATTATTATTATCCATTACTT -TTTTTATTATATTTTAATTATTTATTTATTTATTTATTTATAATAATAATATTTCATATT -ATCAATTATTATTTTTTTTTTTTATAATATATAATTAAATTATTTATATAGTTCCCCGAA -AGGAGAATAAATAAAATATTATATAAATATTTATATCTTTATTAATATTAATATAAGTAA -TATATATAGTTTATGATATTTAATTTTATCATAATATAATAATAATTATATAAATCTTAT -ACACATTTATATAAGTATATATATATATTATTAATATAATGAACATCTATTAAATAAAAT -AATTGTAAATCTCAAGTAAATTATTATTATTTTATTTTTAATAATAATTTATGATTTATA -ATTAATAAATAAAAGAGTAATTATATGATAAAAAAGGTAATAAATAAAATTTATAGTTCC -GGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGTTTATTTATATATATATATATATG -AATTAATATTTAATAATAAATAATAATATAATTAATAATATTATTATTATTATAATTTTT -TTATTTATAATATTAATAAAATATTATTATATATATATTATAATAATATTAATAAGATAT -ATAAATAAGTCCCTTTTTTTTTATTTAAAATAAAGAAAGAATAATTAAATAATATTTTAA -TAATTTAATTAAATAGTGTATTAAAAGATAATAAAAAGTAATATTAATATGTTAATTATA -TATAATATATTTATATATAATTATATATATATATATAAATAATAATAAATATATATATAA -TATAAAAATAAGAATAGATTAAATATTTAATAAATAAATATTATGCAATTAGTATTAGCA -GCTAAATATATTGGAGCAGGTATCTCAACAATTGGTTTATTAGGAGCAGGTATTGGTATT -GCTATCGTATTCGCAGCTTTAATTAATGGTGTATCAAGAAACCCATCAATTAAAGACCTA -GTATTCCCTATGGCTATTTTAGGTTTCGCCTTATCAGAAGCTACAGGTTTATTCTGTTTA -ATGGTTTCATTCTTATTATTATTCGGTGTATAATATATATAATATATTATAAATAAATAA -AAAATAATGAAATTAATAAAAAAATAAAATAAAATAAAATCTCATTTGATTAAATTAATA -ACATTCTTATAATTATATAATTATTATAAAATATATAAATATTATAATAATAATAATATA -TATAAATTATAATAAAAAATAATAATAATATATAATATACCTTTTTTTTAATATATTAAT -ATATAAATAAATAAATAATGGATAATATATAATTACTTTTTTTATATTATTAATAATAAT -AATTTATAAATATTGTTATAATAAACATTTATATAAATAAATATAAATTACCATAATAAG 
-ATATATTATTTATTAATAATAAAAATATTTATTAATAAATAAGAAATATATATATTATGA -TAATATTTATTAATAAATAATAAATTCTTTATATATAAATATATTAAATATATTTAATTG -AACACAATATAATTTTTATTGTATTATTCATTTAATAATATTAATATTAATATTAATATA -ATATTAGTGAACATCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTA -ATATTTAATAAAATATATATAATTTATAATTTTCATATAATTAATATAATAATTAGGTTT -ATAAATAAATTATAATATATTATAACAATATAATAAAATATATTATAAATCTATCTATCT -ATCTATATAATATATAAATTTATATATACATTAATAATATTTAATTATAATTATTTAAAT -ATTTAATTTATTAATATTCCCCGCGGGCGCCAATCCGGTTGTTCACCGGATTGGTCCCGC -GGGGTTTATATTATTTAAATATTAAATATTAAATAATAATTTATATTATATTAATAAATA -TAATAAATTAAAAATATATGATTAATTATATAATAATAATAATAATTATTTTAATATTAT -AATTTATAAAATTAATTATATTAATTATATTAATTCTTATTATATAATAATTATTAATAA -TAATTTATTTTAAGAAAGGAGTGAGGGACCCCCTCCCGTTAGGGAGGGGGACCGAACCCC -GAAGGAGAAAATAAATTAATAAAAGTTTAAAAGTTCTTATATTAATAATTATATAATATT -ATATTAAAGATTTTTATAATATATATATATAATATATTTATAGTTCCGGGGCCCGGCCAC -GGGAGCCGGAACCCCGAAAGGAGTTTATTTAATATTTATATTTATATTAATATTTATATT -TATATTTATATTCCTCTTAAGGATGGTTGACTGAGTGGTTTAAAGTGTGATATTTGAGCT -ATCATTAGTCTTTATTGGCTACGTAGGTTCAAATCCTACATCATCCGTAATAATACATAT -ATATAATAATAATTTTAATATTATTCCTATAAAAATAAAATAAATAAATAAATAATAATA -ATTAATTAATTAATTAATTTTAATAAATATAAAATATATAAAAATAATAATAATAATAAT -TATTATTTTAATAATATTATTTATATAATAGTCCGGTCCGACCCTTTTTATTCTTAAGAA -GGGATTTTATTTTATTAATTAATAATAATATATTAAAAATTATAAATAATTAATAATTCT -TTATATTTATATATATATATATATATTTATATATTTATATATATATTTTAATAATATTAT -GATATATTTTATTTTAATAATATTTTTATTTTTATATATAAAATTATAATATTTTATTTT -ATAAATTATTTATATATAAATTATTAATAATAATTATTTTTTTTTATTTGGGATTTATAT -TATTATTATAAAGAATATAATGTTATTAATAACTGCAAAAAATATCTAATATATTATTAT -TTATAATAATAAATAATATTATAATAAGGATGCATATTATATATATATATATATTTCTAT -TTATATTAATATTAATATTAATATGTATATATAATAGATAAAAAGTAAAAATAAAAAATA -ATGAAATTAAAATTATTAAATATAATTTTATCAATAATAAATAAACTTAATAATAATAAT -AATATTATTATTAATAATCTATTAGATTCATTAATAAATAAGAAATTATTATTAAAGAAT -ATATTATTAGATATAAATAATAAAAAAATAAATAATATAAAAAGAATATTAAATAATAAT -AATATAAACCCCGCGGGCGCCAATCCGGTTGTTCACCGGATTGGTCCCGCGGGGAATATT -AATAATAAATTACAACATTTAAATAATATAAATAATTGAAATCTACAAATTTATAATTAT 
-AATAAAAATATAGAAATTATAAATACTATAAATGATAAATTAATTAATAAATTATTATAT -AAAATAATAACTTTAAAATTAAATAATATAAATATTAATAAAATTATTATAAGTAAACTT -ATTAATCAACATAGTTTAAATAAATTAAATATTAAATTTTATTATTATAATAATGATATT -AATAATAATAATAATAATAATAATAATAATTATTATATAAATATAATAAATAAATTAATA -AATATTATAAATAATAATATAAATAATAATTTATGTAATATTTTAAGTTATTATTATAAA -AAAAAAGTAACTATTGAACCTATTAAATTATCATATATTTATTTAAATAGTGATATTTTT -AGTAAATATATTAGTTTAAATGATATAGATAAATATAATAATGGTATCTTAACTAATTAT -CAACGTATATTAAATAATATTATGCCTAAATTAAATGATCATAATATTTCTATAAATTAT -ATTAATAATATTAATAATATTAATAATAATAAATATAATAATATAATTAATTTATTAAAT -AATAATAATAATATTAATAATAATAATAATTATAATAATAATAATAATAATTATATTGGT -AATATTAATAATATTTATAATAATATAACTATTGATAATATTCCTATAGATATTTTAATA -TATAAATATTTAGTTGGTTGATCTATTAAATTTAAAGGTAGATTAAGTAATAATAATGGT -AGAACTAGTACACTTAATTTATTAAATGGTACTTTTAATAATAAAAAATATTTATGAAGT -AATATTAATAATAATTATAAATTAAATTATATCCCTTCTAATCATAATTTATATAATAAT -TCTAATATTAATAAAAATGGTAAATATAATATTAAAGTTAAATTAAACTTTATTTAATAT -ATATATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAAATAAAA -TAAATATAATAAATAAAATAAATAAATAAATAATATATATATATATATAAATATATAAAA -TAATATTTACTTTTTATATATATATAATTATATATAAATAAAATATAATATAATATCATA -TAATTATATAAAAATAAAATTATAATTTATTTATATTAAAAATATTAATTAATTAATTTT -TTTATATAATTATTATAATAATAATTTAATTAAAAATAAATATCAAATAAAATTATAAAT -TAATCCTACTTTTGGATCCTATTTATATTTTATTATTATAAATAATTATTATTGATAGTT -AATTAAATAAAAATATATATATATATTACTCCTTCGGGGTCCGCCCCGCAGGGGGCGGGC -CGGACTATTATAATTATTATTAATATATTAATTATTAAATTATATAAACCGCCCCCGCGG -GGGCGGTTAGTTATTTATATTAATATATTTTATATTAATATATAATACTCTTTTTTCTAT -TATATTTTAATATATAATATTAAAAAAAATAAATAAAATAATATTCTTAATTTTTATTCT -TTATCTTCTTTAACCAAACTCCTTCGGGGTTCGGTCCCCCTCCCATTAGGTTAGGGAGGG -GGTCCCTCACTCCTTCGGGGTCCGCCCCCCCCCGCGGGGGCGGGCCGGACTATTTTAAAT -TTTAATTTAAATTTTATAAATATAATATTTAATTATAAATTTAATAATAATATATAAAAA -ATATATATATGGTTAATATATATAAAGATTATAATCTTTTTATTAAATAAAGGAAAATTT -ATTATATAATTTTTCTCTATAGTTATATATTTAAAACTTATTTTTTTTTTTTTATAAATA -ATAATTATAATAAATAATATTAATTATTTATTATATAATTAATTGGCCCCCATGCTGGGT -TCCGGAACTCCTCCTTCTCGCGAGGTTAACACCTATTATATAACTATAACTATAACTATA 
-ACTATAATTATAATTATAACTATAACTATAAATATTCATTTTAATAATAATAATAATAAT -AATATTAATATAAATAGTCGAAGAATATATTTATTTATTTTAATATAAATAAAAAGTTTC -AATTAATTTGAATTTGGAATTAAATTATTACTTCATATGGGGTTATGGATTTCGTTCGGA -ACTCCTCCCTCCTACCTCTATTTATTAATCATAAATCATAAATTATTATTAATTAATAAT -AATAATTTACTCGAGGTTCATACCTATTTTAATATTAATATTAATATTGATAAAATATAT -ATTCACTAAAAAGTATATAATTTACTCAATTTATACTATAATTTTATATTTTTTTATTAT -AATTTAATTATTTCAAATAAAGTAATTATAATAATATATATCCTTTATTAAATATATATT -AATTAATATATATATAAAAAGTAAATATTATTAATTGTATATAATTATAAATAATTAATA -TTTATTAAAATATATATAATTTATAATCCTCATATAATTAATATAATAAATAATATAACA -CAATGTAATTTAATTTAATTACATAATAAATTTATTATTATTATAATTATTATTTATTTA -TTTATTTATTATTATAAATTATAAATATTATTATAATTAAAATCAATTATTAATTATTAA -TGATAAATAATTAATGATAAATTATCAATAACCAATTAGATTATTTATCGATATTTAATT -ATATTATATTATATTATATTATATATATATATATATATTATATTATAAAATTTATTTATA -AATATTTGTTTATTTATTTATTTATTGAATAACAATAGAATTAAATATTGTCAATAAATA -ATAAATAATGTTTAATATATATTATATTATATTAATATTAATATTATTATTATTTTTTTT -ATTATATTAATATAATTTATAAAAATATAAAATTATTATTTTTATTATAATTTATATATA -TATAATATATATATTTATTAAAATATTTTAAGAAAGGAGAAAAATAATTAAATTAAATTA -AATTAAATTATTTATTATTATTATTATTATTTATATAATAATATATTATTTAAATATTTA -TATATTATTTTTATATTAATATTTATAGATGGGGGGTCCCTATTATTATTGAAAATAATA -ATTATTAATGGACCCCAGATAGCTTCTTGTTTATCATTTATATATATATATATATTATTA -ATTATTTTATTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTTTATAAT -ATTATTATTAATTATTTAATTAATATTATAATCATATAATTTAATATTTTATTTAATTTT -ATTAAAATTTAATATATATATTTTTATTATTATTTAATTAATTTATAAATATAAAATATT -CTTAATATTAAAAATAAATAAATAATAAAGTTTATAAATCATATATTATAATTATTTATT -ATTTTTATATTATATTAATAAAATATTATTATTATAAAAAAAAATAGAAATTTTATAATA -TTTTTATATATTTTTAATTATTATTATTAATATTTATTAAAGGAAATATAAAAACCGAAG -GAATATTATAATTATAATTATAATTATTATTATATTTAATTTATTATTATAATAATAATT -ATAGTCTGCCCCCTCTTTATCTTTATTTTAAAGTTCCGGGGCCCGGCTACGGGAGCCGGA -ACCCCGAAAGGAGAAGGATATTTAATAATTTATAATATTTAATTCATATATATATATATA -TATTTTATTTTTTATATATATATTTAATATATTATATTTATATTTATATTATTATTATAT -TTATATTATATTATTTAATTATTTTTTAATAATATATTATTAATATTTTACCTTTTGATA -AATAAAAATTTATTAAAAATTTTATAATAAGTATTAAAATATCATAAAAGTATAATATTT 
-ATATAAAATGTATAAATTTATAATCTTCTAATTAAATTAAATTAAATAAATAAAATAAAA -TAAATTAAACTCCTTTTGAGATTCACACCTATTTTATTAAAAATAGGTATTCACTTAATT -AAATTAAATTAAATTAAATTAAATTATGGATAATTTATTTAATAAATATATATATTAATT -ATAAAATAATAGTCCGGCCCGCCCCGCGGGGCGGACCCCGAAAGAGTCTGCCCCTTTTTA -TTTAATATTTAATATTTAATATTTAATATTTAATATTTAATATTTAAAGAAGGATATATT -TATAATTTATCATAATATTATTTAATAAGAAATTATTAATTAATTAATTAATTAATTTAT -TTATTGTTTATATTTATTAATATTAATATAATAAAAATGTAAAATACTTAATATTATTAA -TATTATTATATATAATATATATAATAATATATTATATTTATATCTCCTTTATTCCTTTTT -CCCCCGATGGGGACTTATTATATTATATTATTATATATTTCTTCGATAACTTTATATATA -TTTTATTTTTATAAAAAAATATTTATATATTATTATTTACAATAATAATTATTAATAGTC -CGGCCCGTCCCGCGGGGGGGAACCGAAGGAGTGCGGGACCCCGTGGGAACCGCATCCCTT -TTTATTTTTAATTAAGAAGGAGTGAGGGACCCCGTGGGGACCGAACCCCGAAGGAGTCTT -TTTTCTATTTATTAATAATAACTATAAATTATATTTAAAATAATAATTTACTTGTTATAA -TCTTAATGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAAGTATATAAAT -ATTTACTTGTTATAATTTATTATATATTTATAACCTCCTTCTTAAAATTATCTTTACTTT -ATAATAAAAATTAATATAATATAATCTGATAATAATCGAATTTTATTATATTTAATTTAA -TTAATAATAGACAATTATTATTATTATTTTACTTATTAATATTAATTTAGATTTATATAT -ATAAATATTATTAATTTTATATTAATTTTTTATTAATTATTTATTTTTATATTCATATTT -TTTATTAATATTATTTTTATTAATAACTTTTTAAATAATTATAAACTATATATTATTTAT -ATTTATATTTATAATAAATGAAACAATTATAATAAAAATTACAATTACAATTATATTATA -ATTATGATTACAATAGGGTTAAACATTACCTGTGAACAACTGGTAATGTTTAACCCGTAT -TATTATTTATTATATTATATATATATTAAAATATTAATATTAATATTAATATTATATTAT -ATTATATTATATTATATTATATTATATTATATTATATTTATAATTATATTATATTATATA -ATTTATATACTTTTATAATTCTTATTATTATTTATTTATTATTTATTTATTATTATTTAA -ATATATTATTATTATATATTAATAATATATATATTATTTTATATATTTTATTTAATATAA -ATTATTTATATTTTTATATTTTATTATGAGGGGGGGTCCCAATTATTATTTTCAATAATA -ATTTATCATGGGACCCGGATATCTTCTTGTTTATCATTTATTATTCTTATTATTTGGTTT -TTATTTAATATTTATAATTTATTTTATACAATTTATTATATTGTTTATACCTTATTATTA -TTATATAATATATTATATTATTATAATAATTTAATTAATTATATTATTAAATATTAACTA -ATGTGTGCTCTATATATATTATTCATTCTAGTTTCTAATCACCCACCCCCTCCCCCTATT -ACTTATATATCTAGAAATAAAAATACATAACATATATTTTAAATATATATATATAATTAT -ATAATAATTATTATATATAAAATATATATATATATAATATATATTTATAAAATAATAATA 
-ATAAATATTATTACTCCATTAGAGGTTTTGGTCCCATATCAGGAACCGAAACTATAATAA -TATATAATATTATAATAAAGATATTCTTATTTATAATATATTATTAAATAAATTAATAAT -AATTATAATATATATATATAATATATTATAATATATTTATTCGAGAACTTTTTATTTATT -ATAAAATAAAATATTTTATTTATTATTTAGTTTTTTTTTATTAAACATTTTATAAAAATA -TAAATGTTAATAATATTATGATTAATAAGTAATAATAAATTTATTTATTTTTATTAATTA -CTTCTTCGAGGTATTAGTATCAGTATCAGTATCAGTATCGTAAAAAACGGGTGACTAAAA -TATATATATATATAAAATTATAAATAAAAATATTATAATAATTTTAAATAAATAAATATC -AATATATTATTATTATTTATATTATAATAAATATTATCTAATAATAGTCCGGCCCGCCCC -CGCGGGGCGGACCCCGAAGGAGTCCGAACCCCTTTTTTATTTAATTTTATTTAAAGAAGG -AGTGAGGGACCCCTCCCGTTAGGGAGGGGGACCGAACCCCGAAGGAGATAATTAGATATA -ATTATATTTTATTTTATATAAATTATATAATATTATATAATAATAATTATATAATAAGTT -AATAATAATTATATAATAAGTTAATAATAATCATATCTCCTTTATAAATGAACTTTTATT -AAATATATTTTATTAAATATTAAATATATTTTTTATAATATTAAATATATTTTATTAAAA -TATTTAATATATTTTATTAAATATTAAATATATTTTATTAAATATTAAATATAAATAAAG -GTTTATATTATAATTCATTATTTATATCTTCTTTATAAATTAATATTCGTATTAGATCCT -TATTTAATTTATAATCCTTTAAAAAACTTTTAATAAATATAATATAATATATATATAATT -TTTATTATTTTTATATTATTTTTATTATTTAAATATATTATATATTTCATTATAATAATT -ATTTAAAAAGTTATTTAATAAATAATCTGATATTATATTTTATAATTAATTTTATTTATT -TTATTTATTATATATATTATTATATATAATTAAAATTATAATTACAATTATAACTATAAT -TAAATTAAATTAAATTAAATTGGATTAAATTAAATTAAATTGGGCGCCAAGCCGGTTGTT -CACCGACTTGGTCCCAATATAATATGAGATAATATAATATACTATATGATATAACATAAA -TATAATATATTATATGATATAACATAAATATAATATACTCCTTCGGGGTCCGCCCCCGCG -TGGGCGGACCGGACTATATGAATATATTATTATTATAATTATAAAATTATAATAAATAAA -TAAAATTTCTTTAATAATTATTAATTAATATTATTAATTTATTTACAAATATTTTATTAA -TTTTTATTTTTATTAAATATAAATATATAAATATATATATATTTATTTATAATATTATTT -ATATTTATTATATATTATTATTAAATATATTTTTATTATATATCATTAAATATTAATATG -TTATTATAGTGGTGGGGGTCCCAATTATTATTTTCAATAATAATTATTATTGGGACCCCG -GATATCTTCTTGTTAATCAATTATTATATTATTTAATTTATTTTATTTCTTATTTATAAT -TTATATTATATAATTTATTATATTGTTAATACTCCTTCGGGGTCCCCGCCGGGGCGGGGA -CTTTTATTTATATTATTAATTATATTATATTATTATAATATATTTAATTGATTATATTAT -AAAATTATAACTAATGTATGCTTTGTATTTATTGAATAGTTTGGTTCTTATCACCCACCC -CCTCCCCCTATTACTTCTCCGAGGTCCCGGTTTCGTAAGAAACCGGGACTTATATATTTG 
-GTAATTAAAAATATAACTTATATAAATATTTAATAAATATATATTAAATATATTATTATT -AATAATTTATTATTATATAAAAAAATAATAAATATTATTAATGATTTAAATTATATAAAT -ATTAATTATTAAATAAATAATTATACTTTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGG -GCCCCGGAACTTTAAAATAATATATATATATATAAAAGTATTTTATAATAATTAGTTTAA -TTATTATTCTTTTTTTTTATTAAATATAAAATCATTTTAGGTTATTAATTTTTATTTATT -AAAAATAAATTTTATAATTAATATTTCTCCTTTCTTAAAATAAATAATATTATTATTATA -ATTATTAATTAATGAATACTCTTCTCTTTTGGGGTTCGGTCCACCCTCCCGTATACTTAC -GGGAGGGGGGTCCCTCACTCCTTTTGAGACTTTAATTTTATAAATATAAATATAAATATA -ATAAGATGTTAACTCTTTTATAAATAAATAATAAATATAATTCTATTTTTAATAATAATA -TATAATATTTTTATAATAAAATATATAAATAATAATATTTATATATATATATATACTTTT -TTTTATATAAGAATAATATATATAGTTCACATTGGAGGCGAGTAAAAGGAGATAAGAAAT -ATAATATAATATAATAATAAAAATATAATGAATAATAATAATAAAAATTTATATAATAAC -AAAATAGTCCGACCGAAGGAGATGAGATTATTAATATTATTAAATAATAAAATGTATTAA -TTATAAAATATAAAACCTATAAATAATTTATAATATAATTTATATTATGATAATAATAAT -ATATATATTATAATATTTTATATATATATTTATTATATTTATATTTATATAAAAAAGTGA -TATTGATTAATTAATTAATTTATAATTAATAATTATTAATATAGTCCGGCCCGCCCCCGC -GGGGCGGACCCCGAAGGAGTCCGGCCGAAGGAGTTTATTATATTATATTAAATAAGATTT -ATAATATAATTAATATATATTTTAATAAATATAAAAGATTATATTATATTATAAAAAGTA -TATTTTATATTTATATTTTATTTATTATTATTATTATATATATAAGTAGTAAAAAGTAGA -ATAATAGATTTGAAATATTTATTATATAGATTTAAAGAGATAATCATGGAGTATAATAAT -TAAATTTAATAAATTTAATATAACTATTAATAGAATTAGGTTACTAATAAATTAATAACA -ATTAATTTTAAAACCTAAAGGTAAACCTTTATATTAATAATGTTATTTTTTATTATTTTT -ATAATAAGAATAATTATTAATAATAATAAACTAAGTGAACTGAAACATCTAAGTAACTTA -AGGATAAGAAATCAACAGAGATATTATGAGTATTGGTGAGAGAAAATAATAAAGGTCTAA -TAAGTATTATGTGAAAAAAATGTAAGAAAATAGGATAACAAATTCTAAGACTAAATACTA -TTAATAAGTATAGTAAGTACCGTAAGGGAAAGTATGAAAATGATTATTTTATAAGCAATC -ATGAATATATTATATTATATTAATGATGTACCTTTTGTATAATGGGTCAGCAAGTAATTA -ATATTAGTAAAACAATAAGTTATAAATAAATAGAATAATATATATATATAAAAAAATATA -TTAAAATATTTAATTAATATTAATTGACCCGAAAGCAAACGATCTAACTATGATAAGATG -GATAAACGATCGAACAGGTTGATGTTGCAATATCATCTGATTAATTGTGGTTAGTAGTGA -AAGACAAATCTGGTTTGCAGATAGCTGGTTTTCTATGAAATATATGTAAGTATAGCCTTT -ATAAATAATAATTATTATATAATATTATATTAATATTATATAAAGAATGGTACAGCAATT 
-AATATATATTAGGGAACTATTAAAGTTTTATTAATAATATTAAATCTCGAAATATTTAAT -TATATATAATAAAGAGTCAGATTATGTGCGATAAGGTAAATAATCTAAAGGGAAACAGCC -CAGATTAAGATATAAAGTTCCTAATAAATAATAAGTGAAATAAATATTAAAATATTATAA -TATAATCAGTTAATGGGTTTGACAATAACCATTTTTTAATGAACATGTAACAATGCACTG -ATTTATAATAAATAAAAAAAAATAATATTTAAAATCAAATATATATATATTTGTTAATAG -ATAATATACGGATCTTAATAATAAGAATTATTTAATTCCTAATATGGAATATTATATTTT -TATAATAAAAATATAAATACTGAATATCTAAATATTATTATTACTTTTTTTTTAATAATA -ATAATATGGTAATAGAACATTTAATGATAATATATATTAGTTATTAATTAATATATGTAT -TAATTAAATAGAGAATGCTGACATGAGTAACGAAAAAAAGGTATAAACCTTTTCACCTAA -AACATAAGGTTTAACTATAAAAGTACGGCCCCTAATTAAATTAATAAGAATATAAATATA -TTTAAGATGGGATAATCTATATTAATAAAAATTTATCTTAAAATATATATATTATTAATA -ATTATATTAATTAATTAATAATATATATAATTATATTATATATTATATATTTTTTATATA -ATATAAACTAATAAAGATCAGGAAATAATTAATGTATACCGTAATGTAGACCGACTCAGG -TATGTAAGTAGAGAATATGAAGGTGAATTAGATAATTAAAGGGAAGGAACTCGGCAAAGA -TAGCTCATAAGTTAGTCAATAAAGAGTAATAAGAACAAAGTTGTACAACTGTTTACTAAA -AACACCGCACTTTGCAGAAACGATAAGTTTAAGTATAAGGTGTGAACTCTGCTCCATGCT -TAATATATAAATAAAATTATTTAACGATAATTTAATTAAATTTAGGTAAATAGCAGCCTT -ATTATGAGGGTTATAATGTAGCGAAATTCCTTGGCCTATAATTGAGGTCCCGCATGAATG -ACGTAATGATACAACAACTGTCTCCCCTTTAAGCTAAGTGAAATTGAAATCGTAGTGAAG -ATGCTATGTACCTTCAGCAAGACGGAAAGACCCTATGCAGCTTTACTGTAATTAGATAGA -TCGAATTATTGTTTATTATATTCAGCATATTAAGTAATCCTATTATTAGGTAATCGTTTA -GATATTAATGAGATACTTATTATAATATAATGATAATTCTAATCTTATAAATAATTATTA -TTATTATTATTAATAATAATAATATGCTTTCAAGCATAGTGATAAAACATATTTATATGA -TAATCACTTTACTTAATAGATATAATTCTTAAGTAATATATAATATATATTTTATATATA -TTATATATAATATAAGAGACAATCTCTAATTGGTAGTTTTGATGGGGCGTCATTATCAGC -AAAAGTATCTGAATAAGTCCATAAATAAATATATAAAATTATTGAATAAAAAAAAAATAA -TATATATTATATATATTAATTATAAATTGAAATATGTTTATATAAATTTATATTTATTGA -ATATATTTTAGTAATAGATAAAAATATGTACAGTAAAATTGTAAGGAAAACAATAATAAC -TTTCTCCTCTCTCGGTGGGGGTTCACACCTATTTTTAATAGGTGTGAACCCCTCTTCGGG -GTTCCGGTTCCCTTTCGGGTCCCGGAACTTAAATAAAAATGGAAAGAATTAAATTAATAT -AATGGTATAACTGTGCGATAATTGTAACACAAACGAGTGAAACAAGTACGTAAGTATGGC -ATAATGAACAAATAACACTGATTGTAAAGGTTATTGATAACGAATAAAAGTTACGCTAGG 
-GATAATTTACCCCCTTGTCCCATTATATTGAAAAATATAATTATTCAATTAATTATTTAA -TTGAAGTAAATTGGGTGAATTGCTTAGATATCCATATAGATAAAAATAATGGACAATAAG -CAGCGAAGCTTATAACAACTTTCATATATGTATATATACGGTTATAAGAACGTTCAACGA -CTAGATGATGAGTGGAGTTAACAATAATTCATCCACGAGCGCCCAATGTCGAATAAATAA -AATATTAAATAAATATCAAAGGATATATAAAGATTTTTAATAAATCAAAAAATAAAATAA -AATGAAAAATATTAAAAAAAATCAAGTAATAAATTTAGGACCTAATTCTAAATTATTAAA -AGAATATAAATCACAATTAATTGAATTAAATATTGAACAATTTGAAGCAGGTATTGGTTT -AATTTTAGGAGATGCTTATATTCGTAGTCGTGATGAAGGTAAACTATATTGTATGCAATT -TGAGTGAAAAAATAAGGCATACATGGATCATGTATGTTTATTATATGATCAATGAGTATT -ATCACCTCCTCATAAAAAAGAAAGAGTTAATCATTTAGGTAATTTAGTAATTACCTGAGG -AGCTCAAACTTTTAAACATCAAGCTTTTAATAAATTAGCTAACTTATTTATTGTAAATAA -TAAAAAACTTATTCCTAATAATTTAGTTGAAAATTATTTAACACCTATAAGTTTAGCATA -TTGATTTATAGATGATGGAGGTAAATGAGATTATAATAAAAATTCTCTTAATAAAAGTAT -TGTATTAAATACACAAAGTTTTACTTTTGAAGAAGTAGAATATTTAGTTAAAGGTTTAAG -AAATAAATTTCAATTAAATTGTTATGTTAAAATTAATAAAAATAAACCAATTATTTATAT -TGATTCTATAAGTTATTTAATTTTTTATAATTTAATTAAACCTTATTTAATTCCTCAAAT -GATATATAAATTACCTAATACTATTTCATCCGAAACTTTTTTAAAATAATATTCTTATTT -TTATTTTATGATATATTTCATAAATATTTATTTATATTAAATTTTATTTGATAATGATAT -AGTCTGAACAATATAGTAATATATTGAAGTAATTATTTAAATGTAATTACGATAACAAAA -AATTTGAACAGGGTAATATAGCGAAAGAGTAGATATTGTAAGCTATGTTTGCCACCTCGA -TGTCGACTCAACATTTCCTCTTGGTTGTAAAAGCTAAGAAGGGTTTGACTGTTCGTCAAT -TAAAATGTTACGTGAGTTGGGTTAAATACGATGTGAATCAGTATGGTTCCTATCTGCTGA -AGGAAATATTATCAAATTAAATCTCATTATTAGTACGCAAGGACCATAATGAATCAACCC -ATGGTGTATCTATTGATAATAATATAATATATTTAATAAAAATAATACTTTATTAATATA -TTATCTATATTAGTTTATATTTTAATTATATATTATCATAGTAGATAAGCTAAGTTGATA -ATAAATAAATATTGAATACATATTAAATATGAAGTTGTTTTAATAAGATAATTAATCTGA -TAATTTTATACTAAAATTAATAATTATAGGTTTTATATATTATTTATAAATAAATATATT -ATAATAATAATAATTATTATTATTAATAAAAAATATTAATTATAATATTAATAAAATACT -AATTTATCAGTTATCTATATAATATCTAATCTATTATTCTATATACTTATTACTCCTTTT -TAATTAAATTAAAAAGGGGTTCGGTTCCCCCCCCCCATAAGTATGATTATAATTATAATT -ATAATATAAGGGAGGGGTCCCTCACTCCTTATGGGGTCCCGGTTGGACCGAGACTCCTCC -CTTGCGGGATTGGTTCACACCTTTATAAATAAATAATAAATAATAAATAAAGGTGTTCAC 
-TAATAAATATATATATATATATATATATATTATATTATAATATTATTTAATACTTAATAT -ATTATATATTTTATATTTAATAAATAAAAAAAATATTAATAAATAATAATATTAATAATA -AAGAAATTATAATTAATACCCTCTATATATAATTCTAATTAATTAAATTAAATATTTATA -TATAATAATCAATATATTATTAATTTAATAATTATTATAATAGTTCCGGGGCCCGGCCAC -GGGAGCCGGAACCCCGAAAGGAGTTTATAAAAGATATATTTTTATATTATATTATATTAT -ATTTAATAAATATTACCTTTTTTTATTATTATTTTTATATATTATATAATATTATTAATT -TTTATTATAATATTATTTACTTTTTTATTGGATTATTTATTTATTTATTTATTTATTAAT -TAATTAATTAAATATTTATTAATTAATATATATATTAAATATTAATATTTCATTAAAAAA -AAGAGATATATGAATAATATATTATGTTATATTATATTATATAATTATATTATTTTTATA -ATATTAATAATTAAAAATAAGAACTTATTTAAAAATTATAATTATGATAATAAATTAATA -CTTTTTAATTTATAAAAATATAAATTTCTTTACATATATATATATATATATTATTATTAT -TTATATTAATCATAATTTTAATATTTATAATAAATTTATATAAAATCAATTATAATATTA -TATACTTTTTATATACTTTATAATCTTTATATCTTCACCCCCCCTTTTTTAATAATATAT -TATATTAAAAATATAATAATTTATATGATTTATTAATACTTTTTATATAATTATATTATT -ATTTTTTTTTATAGATGTTATATTATTTTTTATAATAATTTTTTTTTATTTAAATAAAAT -TTATAACTCCTTCTTAATTAAAGATAAAAGGGGTTCCCCCCTTAAGTATAAGTATAAGTA -TAAGTATAAGTATAAGTATAAGTATAAGTATAAGTATAAGTATAAGTATAGTATACGGGG -GGGGGGTCCCTCACTCCTTCGTTAATTTATATATATTATTAATAATTATTTAATTTTTAT -TATTTATTTATATATAAAAATATTCTAAAATTATTAATATTTATAATAGAATAAATATTA -TAAAGTATAATTATAAATAATTAATTATTTAAATAATAATAATATATTTATTATTATATA -ATAAATATATTATAAATAATAGTTATATTAGCTTAATTGGTAGAGCATTCGTTTTGTAAT -CGAAAGGTTTGGGGTTCAAATCCCTAATATAACAATAATAATAATAAAATATTAAAATAA -ATATAATATTTATAAAAAATTTATTAATTTATATAAAAAATATATATATAAATAATAATT -ATAATAAAACATTTTATAATCAATAATTTAATAAATAATCTTCTTATTATAATATTATGT -TTAAATATTACTCTTTATGAGGTCCAACAAACTAATAAGATATAAATATATATATATTAT -ATAATAATAATAATAATAATATATTATTTAATATATTATCAAGAAGATAAATATAAATAA -TATTTTAATAATTTTAAATAAATCTAATTTATATATTAATAATTTAATAATCTTAATATT -TATTATCATTATTTCATATTTATATTATATAAATATTTATTTAAATAAAAAATATTAAAG -AGTTTATTTTATTTATTATAAATTATTTAATAAAATATATATAATAATATATAGAATAAA -GATATAAATAATTATAAGTATATAAAGTAATAAAGGAGATGTTGTTTTAAGGTTAAACTA -TTAGATTGCAAATCTACTTATTAAGAGTTCGATTCTCTTCATCTCTTAAATAAATAATAT -AATAATAAAATATTATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGAT 
-AAATATATATATATTTATAATAATTATATAATAAAGGTGAATATATTTCAATGGTAGAAA -ATACGCTTGTGGTGCGTTAAATCTGAGTTCGATTCTCAGTATTCACCCTATAAATAATAA -TAATAATATATTTTATTATTCTTAAATTTTTTATTCTTTATATTATATATATAATATTAA -TATTATTACTTTTTAATAACAAAATATTATAATTAATTGATATATATATATACCAAATAT -AATTAATTGAAATTAAATAATAAATAAAATATTTACTTCTTTATTAAAATTCTAATTAAT -TGATTCTTTTTATTGAATATTAAATTCTATTATAACTTATTAATTAATTAATTAATTAAT -TATAATAATAATAATATTTATTATTAATTATTAAATATTTATTATTATATATAAGATTTA -ATTTTAAATATTAATAAAAAAAGAATAAAATAAAATAAAATGAATAATATTTCTTTATCT -CTTTCGATCGGACTCCTTCGGCCGGACTCCTTCGGGGTCCGCCCCGCGGGGCGGGCCGGA -CTATTTATTATTATAATATAATATTTAATCAATAGATTTATAATTTATTTAATGAATATT -TTATAAATATATAAAACAATTCCTTTTTATTATTATAAATTTTTCATTATTTATTTATTT -ATTTATTTATTTATTCAATATATAAAAATAATTATAAAAAGATTATTAAAAATAATAATT -TAATGATAAATATATATTATATATATTAATATAAAAATAATAAATATAAATATATTATGT -AAATATTATATAAATTTGTATATGTATATATTATAATAATGTTATATAAGTAATAATATA -ATAAAATATTTTATGTAATTTATATATATTTATAATTATAAAATAAAAATATTATAAATA -ATAAAATTAATAATAATAATAATTTTAATAAAATAAATTATATATTTAATTTTATTATGA -AGTTTATACTTAATATAAATTATATTTCCTTTATAAATTATTAATATATCCTTTTTAATT -AAATAAAATAAAAATATTATAAATATTAATAATTAATTTTTTATTTATATTTATATATAT -ATTAAAGATTAAATATATTATTAATACTAATTTATAATTTATTATTAATAAATAGTCCGG -CCCGCCCCCTGCGGGGCGGACCCCGAAGGAGTTCGACTTAAATTATAATTTAATAATTTT -TATTTATTAATAGTTTCGGGGCCCGGCCACGGGAGTCGGAACCCCGAAAGGAGTTTTATT -ATTAATATAAAAAGAGTAAGGATAATAATAAATTCTTTTAATTTATTTTTAATAAAATAT -AATTTTAAAATAGTTTTTATAGTCCGGCCCGCCCCGCGGGGGGGGGCGGACCCCGAAGGA -GTTCGGTCTGGCATTAATTATAATAATTATATTAATATTATTATTATTTATTATATTATA -ATATATTTATTTATATTTTATAATATTAATAATTATTTTATATTTAATAAATATAATATA -TATATTATTTTTTTTAATAACTATCTAATTAATAGCTATTTTGGTGGAATTGGTAGACAC -GATACTCTTAAGATGTATTACTTTACAGTATGAAGGTTCAAGTCCTTTAAATAGCAATAA -ATATATATAATATATATAATATATATAAATGAGTCGTAGACTAATAGGTAAGTTACCAAA -ATTTGAGTTTGGAGTTTGTTTGTTCGAATCAAACCGATTCAATATTATAATATATATATT -ATTTATATATAAATATATAATTATACTCCTATTTTTATATTAATTAATTAATAATATATG -ATAATATAAAAATTATTGAATTATTAACTCTTATTAATAATAATAATAATCATAATAATA -ATATATATATATATAGTATATATATAAAAGTTTTATTATATTATATTATATTATATATTT 
-ATTTATATATAATTCTTATTAATTGAAAAAAGAATAATTAATAATCTTATTAAAAAAATA -AATACTTTCATTTTATTTTATTTTATTTAATTTAATTATAATATATAAATATTAAAAAAA -GGATATAAGTTTTTTATAAGATATAATATATATATATATTAAATATAAAGAAGTTAATAT -TTATATTTTAATTATAAAATGTTAATACTCCTTTGGGGACTTATTAATTAAATTATTAAT -TAATAATAATTTATGATTTATAAATAATAAATAAAGGAATAAGTATCAATTAATTAATAT -ATTATATTTAATATTTTATATTTAATATTTAATATTTAATATTTTAAGTTCCGGGGCCCG -GCCACGGGAGCCGGAACCCCGAAAGGAGTAGTATTAATTATGGATAGTGAGGGTGGATTT -AATCCTTTTGTTATGTTATTAATTAATTAATTAATTTATATATATAAAATATTTTAATTA -ATTTTTATATAAATATATATATATATATATATTAATAATAGTCCGGCCCGCCCCGTGGGG -CGGACCCCAAAGGAGTAATATATATTATGTATAAACAATAGAGAATATTGTTTAATGGTA -AAACAGTTGTCTTTTAAGCAACCCATGCTTGGTTCAACTCCAGCTATTCTCATAATATTA -TATATATATATTTCCCTTTCTAAAAATAATAATAATTATATATAATAATAATATAATTAT -ATATATATATATTATAATAATAATAATAATAATAATAATAAATAATAATAATTATTTTTA -TTAATAATATTAATATATTATAATTATTAATAAATATTAATAAAAATAGCTCTCTTAGCT -TAATGGTTAAAGCATAATACTTCTAATATTAATATTCCATGTTCAAATCATGGAGAGAGT -AATTATATTATATTAATAATCCCCCCCCCATTTTTAATTAAATTAAGAAGTTTAATTTAC -TATTTAATAATAAATGAAATAATAATAATAGATATAAGTTAATTGGTAAACTGGATGTCT -TCCAAACATTGAATGCGAGTTCGATTCTCGCTATCTATAATTAATATTAATATAAATTAA -TATCCTATAATTAATTAAATACAAAATTATATTAAAACTTATATTATATTATATTATAAT -ATTATATTATTATTATATAAAAATATAATAATAATAATATTTAATTTTATTTAATAATAA -TATTTTATATAATAAAATAATCATATTTATAATATTTAATATTAATAATAATTTATTATA -ATAATTCTTTAATATACTTATTTATTATTATTTTAATAAATAAATATAATTCTTATAAAT -ATATTATAACAAAATATATTATATTTTAATTAAATACAATATTATAAATATATATATATA -TATAAATATTTATATAAAAAAAAAATAAAAATATTTTAATAATTATTCTTTATAAATAAA -TAATGATAATAATAATTTATAATAATCTCCTTGTGGGGTTCCGGCTCCCGTGGCCGGGCC -CCGGAACTATAATATATTTTAATATATTTTTTATTACTCCTCCTTTGGGGTCCGCCCCGC -GGGGGCGGGGCGGACTATAATAATTTTTTATTGATAAAAAAGTATATATAATATAATTAA -TATATTTCTTTTTATATAAATTATAAATATTATTTTATAATAAAAAAAGTATATATAATA -TTATATATTTAATAAATAATATAATAATAATATAAATAAATATATATATATTATTAATAT -ATTAAATTTTATAATAATAATTATAATAATAGTAGTAGGTATAAATTTTAATAAAGAGTT -TTATTCCAATGGAGTAATAATAATAATAATAATAAAATAAAGGATCTGTAGCTTAATAGT -AAAGTACCATTTTGTCATAATGGAGGATGTCAGTGCAAATCTGATTAGATTCGTATATTT 
-ATACTTAATATAAAAAAAATAAATAATAATCTTTTTTATTATTATATTTATTAATAATAA -ATTATTTTGTTATTATTATTAATTTATATTAATATTTTATATAAATTATTTATTTAATCT -TTCATTATATATTTAATATATTATTAATATTAATTAATATTTTATAATAAATAAATAAAA -TAAAATAAATATTTTAATATAATACTCCTTCGGGGTTCGGTCCCCCTCCCATTAGTATAG -TATAGGGAGGGGTCCCTCACTCCTTCGGGGTCCCCGCCGGGGCGGGGACTTATTTTTATA -TTTATTAATAATAATTAATTTTTATATAAATTTATTATTTCTTACAATATATTTATTACT -ATTATTTTTTAATAATCTTATATATAATATATAAAATATATATATATTATATATATATAT -AAATATAATATATATTATTATAAATATTTATAATCTTATTAATTAATTAGATTATATTAT -ATTATATTAGATCATATTATATTATATTATATTATATTATATTATTATTATTAATATTTT -TATTTTTATTTTATATTTAATAGTAAAAAATCATAATTTTATAATTTATTAATTATTATA -TAATTTCATTAATATATTTCTTCTTTTTATTTATTTATTTATTACTTATTAATAGTTCCG -GGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAAAATAATATAAAAAATAATTATAATT -TATTATAATTTATTAATTTATTAATTTATTAATTTATTTATTAATTTATTAATTTATTTA -TTATTATATTTTTTTTAATAAAGGAAAATTAACTATAGGTAAAGTGGATTATTTGCTAAG -TAATTGAATTGTAAATTCTTATGAGTTCGAATCTCATATTTTCCGTATATATCTTTAATT -TAATGGTAAAATATTAGAATACGAATCTAATTATATAGGTTCAAATCCTATAAGATATTA -TATTATATTATATAATATTATATATTAATAAATATTATTAATTAATTTATTTATTTATTT -ATTATTAAATAAAAATATTTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAA -AGGAGAATAATATAAAATATTATAATTATTTATATATTAATTATTAATTATTTATTATTT -ATTATATAAAAAGTATATAATTTTATATTTTAATATAGGGTTAATTAATTAATTATTAAT -TTTTTATAATAAGATAATAATATATTAAAAACTTATTATAAATTTATAAAATAATATTTA -TTTACTTTGATATTATTTTTAATCTTTCATTAATATATATTTTATTATAAGTAATAATAT -AGTTTAATTTAATTAATATAAATAAATTACATAAGAATAATATTATAATAATATTATATA -TTATATAAAGAAATAATAATTTATATTTTTATTTTTTTTATAAATAATATAAATATAAAT -ATAATGGGGTTATAGTTAAATTTGGTAGAACGACTGCGTTGCATGCATTTAATATGAGTT -CAAGTCTCATTAACTCCAATAATTATATTATATAATATATATATTAATAAATTATATATA -TATATATATATATAAATATTAAATAAATATTATATTAATAAATAATATAAATTATCTAAT -CGAAGGAGATATTTATAATATAATATAAATATTTTAATAAATTAATAAATATTATATTAA -TAAATAATTAATAAATATATAAATTATAATAAATTTTAATATTATTATATAAATTAATTA -AATATAATAATTAATGAAATAGAAACTATAATTCAATTGGTTAGAATAGTATTTTGATAA -GGTACAAATATAGGTTCAATCCCTGTTAGTTTCATATTATATATCATTAATATATAAAAT -ATAAATATATATATTATAATAATAATAATAATAAATATAAATATAATTATATATATATAT 
-ATATATAAATAAATAATTATTTAATTTATAATAAATATATATAGTTCCCGCGAAGCGGGA -ACCCCATAAGGAGTTTTATTATTAATTATATTTAATAAATATTAATTATTAATTTTATAT -TTATAAATAAATTTATTACTCCTTCTTAATTAAGAATAAAAAGGGATGCGGTTCCCATGG -GGTCCCGCACTCCTTCGGGGTCCGCCCCCTCCCCTGCGGGAGGGGAGCGGACTATTTTAT -TAAAAATATTATAATTAAATAATAATATAAATAATTTATAATATAATAATATATACTTAT -AAATAATATTTAAATCTTATTATTAATTTATAAATCATAAATTATTATTAATAAATATCT -CTTTTAGATAAGATAAATTGAACTTATATTTATATTATATATATATAGATATAAATCTTA -AATAGAGTAAATATATTATAATAATTATATAAATATATATATATTATATTAAGATAATAA -TATATATATATATTAATATATAAGGAGGGATTTTCAATGTTGGTAGTTGGAGTTGAGCTG -TAAACTCAATGACTTAGGTCTTCATAGGTTCAATTCCTATTCCCTTCATAATTTATTATT -AATTATATATTATTATAAATCAAATCCATTGAAATTAATATAATCCAATGAATAATTAAT -TTAATACATAATTTAATATATAAAATTATATATATATATACTTTATAAAAAAAAAAATTA -TATAATAATTATATTAATATATTTATATATATAAATAAATAAATAAATAATAATAATTAT -AATTATAATTATAATTAATTAATTAATAAATAAATAATAATTTATATTATCTTTATAATA -TATATATATACTTTTATAAAAAAAATATATAAATAATTCTAAAATGTATATTTCTCCTTT -CGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAACTATTAATAAAATTAATAATAAAATAA -TTATTATCTGTATTTAATAAATTTAATTATAGAGTTATATTTCTATATATTTATATATTT -ATTTATTTATTCTCCTTCCGGAACTAATAAAATATATAAAATAAGGGTTTTTATTTATTT -AATTAATATATATTTATTCTTTTATATAATATGTCCTTATAGCTTATCGGTTAAAGCATC -TCACTGTTAATGAGAATAGATGGGTTCAATTCCTATTAAGGACGATAATAATATATATAT -ATTTTAATTTATATATCATATATATATATATATTAAAGAAAATAATATAAAAAGTATGTA -TTAATAATAATAATAAATAAATAATAATAAATAATTTTATTATATTATATTATATTATAT -TTATTGATATATTTATTGATATTTATTAATTTAAGATTATTCATTAAATATATAATTATT -AATAATTTAATATATTTTATAATTTTTATTATATTTTATGTAAGAAGAAACTATTTTATA -TATTATATATATATATATAATTTTTATAAAATGATAAATTTTATATTATAAATATTATTA -AAATATTTTTATAAATATTTAAATTATTTATAAAAAGGTATATAATAATAATTATTAATA -TTATATTATATTATATATTTATTTATATTATATATAATAATATATTTATATATATATTAA -TTAATAAATTAAATAAGTATCTATATTTTATATTATATTATATTATTTTATTTTATTAAT -TCCGGAAGGAGAATAAAAAGTATTCTAAAGAAATTATATATTTATTATTTTTATTAATAT -GTTATAAATTAATAAAAAATAAATATGTATATATAAATTATATTTATTATGTTTAATTAT -TTATAATTTATTATAATATATAGTATAAGATATCTTATTTATATTTATATATAATAAAGA -ATATTATTAAACTAACACCTATATTATATATATTATATTATATAATATTATATATATATT 
-AATTACTAAGAATAAATTTATAATTAGATAATATTTATATTTATTTATTTATTTAATTAA -CAAATATATTAATATTTTTAATTAATTAATAATACCTTTATATATATATATATATATATA -TTAATTTTAATTATATAATTATCTTTTTTATTAATAATTATAAATATATTATATATTTTA -TATAATAAGATTATAATTTTATAATTATTTTATTTTTTATTAAAAATTATTATTATTATA -ATTATTATATTATAATTATAAATTATTAAAGAATATATTTATTAATATTTTAATAATTAA -TATCTTTTATTTATATTTATAAAATAAGGTATAAATATTGATAATAAAGAGTAAATATTG -TATTAATTATAATAATAATTATAATTAAGGAGCTTGTATAGTTTAATTGGTTAAAACATT -TGTCTCATAAATAAATAATGTAAGGTTCAATTCCTTCTACAAGTAATAATGATTATAATA -TTTATATATATTAAAATAATATTAATAAATAATTACTCCTCCTAGCAGGATTCACATCTC -CTTCGGCCGGACTCCTTCGGGGTCCGCCCCGCGGGGGCGGGCCGGACTATTTTATTATTA -TTAAATAGATGTTCATTAAATAATTATAAATATAATTTATCTTTTAAATATATATATATA -ATATAATATTTAAATATATATTATAAATAAATAAATAAATAATTAATTAATAAAAACATA -TAATGTATATTTATCTATAAAAAATATTAATTAAATTAATATATTATTACAGTTCCGGGG -GCCGGCCACGGGAGCCGGAACCCCGAAGGAGATAAATAAATAAATAAATATAAATAATTC -TTCTTCTTTAAAATTAAATAAAATAAAATAAAAAGGGGGGCGGACTCCTTCGGGGTCCCG -CCCCCCTCCGCGGGGCGGACTATTTTATTTTTAAATATATATTATATTAATAATATAAAT -ATAAGTCCCCGCCCCGGCGGGGACCCCGAAGGAGTATAAATAAAAATTAATAATATATTA -TATATATATTATATTAATAATAATAATAATAATAATAATAATAAATAATAACTCCTTGCT -TCATACCTTTATAAATAAGGTAATCACTAATATATTATAATAATAAAAATTATATATATT -ATATATAATCTAAATATTATATATTTTAATAAATATTAATATATATGATATGAATATTAT -TAGTTTTTGGGAAGCGGGAATCCCGTAAGGAGTGAGGGACCCCTCCCTAACGGGAGGAGG -ACCGAAGGAGTTTTAGTATTTTTTTTTTTTTAATAAAATATATATTTATATGATTAATAA -TATTATATATATTATTTATAAAAATAATATATAATTTTAATTATTTTTAATAAAAAAAGG -TGGGGTTGATAATATAATATAATATTTTTTATTTTAATTTATAATATATAATAATAAATT -ATAAATAAATTTTAATTAAAAGTAGTATTAACATATTATAAATAGACAAAAGAGTCTAAA -GGTTAAGATTTATTAAAATGTTAGATTTATTAAGATTACAATTAACAACATTCATTATGA -ATGATGTACCAACACCTTATGCATGTTATTTTCAGGATTCAGCAACACCAAATCAAGAAG -GTATTTTAGAATTACATGATAATATTATGTTTTATTTATTAGTTATTTTAGGTTTAGTAT -CTTGAATGTTATATACAATTGTTATAACATATTCAAAAAATCCTATTGCATATAAATATA -TTAAACATGGACAAACTATTGAAGTTATTTGAACAATTTTTCCAGCTGTAATTTTATTAA -TTATTGCTTTTCCTTCATTTATTTTATTATATTTATGTGATGAAGTTATTTCACCAGCTA -TAACTATTAAAGCTATTGGATATCAATGATATTGAAAATATGAATATTCAGATTTTATTA 
-ATGATAGTGGTGAAACTGTTGAATTTGAATCATATGTTATTCCTGATGAATTATTAGAAG -AAGGTCAATTAAGATTATTAGATACTGATACTTCTATAGTTGTACCTGTAGATACACATA -TTAGATTCGTTGTAACAGCTGCTGATGTTATTCATGATTTTGCTATTCCAAGTTTAGGTA -TTAAAGTTGATGCTACTCCTGGTAGATTAAATCAAGTTTCTGCTTTAATTCAAAGAGAAG -GTGTCTTCTATGGAGCATGTTCTGAGTTGTGTGGGACAGGTCATGCAAATATGCCAATTA -AGATCGAAGCAGTATCATTACCTAAATTTTTGGAATGATTAAATGAACAATAATTAATAT -TTACTTATTATTAATATTTTTAATTATTAAAAATAATAATAATAATAATAATTATAATAA -TATTCTTAAATATAATAAAGATATAGATTTATATTCTATTCAATCACCTTATATTAAAAA -TATAAATATTATTAAAAGAGGTTATCATACTTCTTTAAATAATAAATTAATTATTGTTCA -AAAAGATAATAAAAATAATAATAAGAATAATTTAGAAATAGATAATTTTTATAAATGATT -AGTAGGATTTACAGATGGAGATGGTAGTTTTTATATTAAATTAAATGATAAAAAATATTT -AAGATTTTTTTATGGTTTTAGAATACATATTGATGATAAAGCATGTTTAGAAAAGATTAG -AAATATATTAAATATACCTTCTAATTTTGAAGAACTACTTAAAACAATTATATTAGTAAA -TTCACAAAAGAAATGGTTATATTCTAATATTGTAACTATTTTTGATAAGTATCCTTGTTT -AACAATTAAATATTATAGTTATTATAAATGAAAAATAGCTATAATTAATAATTTAAATGG -TATATCTTATAATAATAAAGATTTATTAAATATTAAAAATACAATTAATAATTATGAAGT -TATACCTAATTTAAAAATTCCATATGATAAAATAAATGATTATTGAATTTTAGGTTTTAT -TGAAGCTGAAGGTTCATTTGATCTATCTCCAAAACGTAATATTTGTGGTTTTAATGTTTC -ACAACATAAACGTAGTATTAATACATTAAAAGCTATTAAATCTTATGTATTAAATAATTG -AAAACCAATTGATAATACACCATTATTAATTAAAAATAAATTATTAAAAGATTGAGATTC -ATCTATTAAATTAACTAAACCTGATAAAAATGGAGTTATTAAATTAGAATTTAATAGAAT -AGATTTTTTATATTATGTTATTTTACCTAAATTATATTCATTAAAATGATATAGTCGTAA -AGAAATTGATTTCCAATTATGAAAAACACTTATAGAAATCTATATAAAAGGTTTACATAA -TACACTTAAAGGTTCTAATTTATTAAAATTAATTAATAATAATATTAATAAAAAAAGATA -TTATTCTAATTATAATATTTCTCCTTTCGGGGTTCCGGCTCCCGTGGCCGGGCCCCGGAA -CTAAAAATATTATTGATGATGTATTAAATATAAATCTTATCTATAATTATAAATTACCAT -ATCGTATAAATAGTGATATTCAACGTTTAAATTCTATAAATAATAATAATACTAAATTTA -TTAATGTTGGAGTATTTGTTTATGATTTAAATAATACATTAATTATAACATTTACTGGTT -ATAGACCAGCAGCTCTTTACTTTAATTGTTCTCCTTTTCGGGGTCCCGACTGGGGCCGGG -ACTAAACATGAAATTGCTAAATATATTAAAAATGGTAATGTATTTATAAATAAATATATT -TTAAAAAATATTTTATTAGATTAATTATTATTTTTACTTCTTCTTAAAATTAAAAAAGGA -GACTTTTTTATATTTATATAAATTATATATAAATTATTCTTTTATTATAAATATATAAAA 
-TTATTTTCTTTTAATTATTTTTATAATTAATTAATTCTTCATGGCTATAGCCATAACTTT -TAATAATATTCTTTTATTCTTTATTATTATATATATATATATTTATTATTTATTATTATA -GAATTTATATTTATAAAAATATTAATATTTTATTTAAAATAAATAATGATTAATTTATAA -AATATATATTAATTAAGTTTCGGGTCCCGGCTACGGGACCCGGAACCCCCGAGAGGAGTT -ATTATATTTATAATTAAATCTTTAAATAATATATCTTAAATTATTATATTGATATTAATA -TTATATTGATATTAATATTAAATATATATTTAATATTTAGCTTATTATTTTATAAAATTA -TATTTATATATTATAATATAATTAAATATATTATAAATTTAATAATTTAATAAAAATATT -CTTTTTATAATTATTATAATAATTAAATAAATAATAATAATAAGAATAATTAATGTATAA -TTTTTTTATAAATATTATATATTTTTATATTAATAGTTCCGGGGCCCGGCCACGGGAGCC -GGAACCCCGAAAGGAGAAATATTAATAAAATAAAATAAAATTATAATATAATTAAATTAT -AAGAATTATATTTACTCCTTTTATAATTTATATTTATAATATAATATAATATAAAATAAA -TATAATATAATATAAAATAAATATAATGTAATAGGTATTCACTCCTCTTTGGGGTTCCGA -TCCCCCATACGGATACGGATACGGATACGAATACGGATACGGATACGGATACGGGGGGCC -GTCCCCCAGAACTTAATATTATATCTTAAATAATTAATATAAATATAATATATTATTTAA -TAATAATAATAAATAAATAAATAAATAAATAAATAAATTAAATAAATAATAATATTATTA -TAATTACTTTTTAATAAATAATATTAATATAATATTATATTAGTATTATAAATAGACTTT -TTATTATTTTATATATAATATAGTCCGGCCCGCCCCCGCGGGGCGGACCCCGAAGGAGTA -ATATATTATATAATTATTATTTTTAATTATAAATAAAATATAATTATTATTTATTATATA -ATTTATATAAATATATATATATATTTATTATATATATAAATATAAATATAAATATAATAA -TTAATAATATTAAAGTTTTATATATATTAATATATTATAAAAGGTTTATATATATATATA -ATAAGATAAGTAATAAATTAATTAATTAATAATATAAAAATATATATTATATATTATGTT -TTATTTATATATATATATATATTATGTATTATTATATAAATATATATATATATTATATTA -TAAGTAATAATAAGTATTATATTATATATAGCTTTTATAGCTTAGTGGTAAAGCGATAAA -TTGAAGATTTATTTACATGTAGTTCGATTCTCATTAAGGGCAATAATAATAATATATTAA -TTAATAATTAATTTATAATAAATATATTATAATAATTAATATATATATATATAATATATT -TAATACAAAGAAAATATATATTATATCTCTTATTTATTTATTTATTAATATTTTAATAAA -TATAATATTATAAAAAAAAGTTTATATATTTAGTTCCGGGGCCCGGCCACGGGAGCCGGA -ACCCCGGTAGGAGAAATATAAATATAAATATAAATATAATATAAGTTTGGTATTCATTTA -ATTATATTATTTAATTAAAAATATTCTAAATAAGAATAAATATCAATAAAGGAGTTATAA -ATATATATATATATTAATATATATATAAAAATATATATTATTATTAGTTCCCGCTTTGCG -GGAACCCCGTAAGGAGTGAGGGACCCCATGGGAACCGAACCCCTATTTAAGAAGGAGTTT -TATTATAATAAAATTTATATATATTTAATATATAATTATAAAAATATTATATAATAAATA 
-ATAAATAATTATTAATAATAAATAAATATAATAATAATATTATAATAAATTTATAAATGA -TTATAATAAATTTATATTAATTTTTTATTTTGTAAATACTAAGATTTGAACTTAGATAAT -ATGCACCTAAAAACATACATTTTACCATTAAATTATATTTACCTTATTAATTATATAAAA -TTTATTAAATATATAATATATTAATTATATAAAAATTATTAAATAAATATATAATATATT -ATATATAATTTATAATATATATATTATAAATATTATTATATATAAAATATAATATACTAC -TTATAAAAATATATATATATATATAAATATATATATAAATAAATATTTTATATATTAAAT -TAAATAATTATTAATAAATTTAATTATAAAGTATAATTTTCAATAGGAATATTTATAAGA -TTATAATAATTATATGAATTATTATAATTATATATATATAAATAAATAAAATAATAATTA -TAATAATTAATAAGAGTTTTGGATATATATCTGTGGAGTATATATTTTATAAAGGAGATT -AGCTTAATTGGTATAGCATTCGTTTTACACACGAAAGATTATAGGTTCGAACCCTATATT -TCCTAAATCTAGATATAATATTATATCTATCTTAATATAATAATATTTATTTATTAAATA -AAAAAAAAATAAATAATATTAATTAATATAAGATTCTTTTTTAATTATAATAATAAATAA -ATAAAAAGAAGATATTATCAATGATTTATATTAATAATAAATATAAATAATAAAAAATAT -ATATAATATAATATAATAAATATATTTCCTTTAATATTAATAAATTAATAATAATAATAA -TAATAATAATAAAATATTTAAATAAATTATATTCAATACAAATTAATTATTTATATTATT -AATAATTGAATAAATAATCCGGTCGAAAGAGATATTAATTCGATTATATTATTTATTTAA -TTATATTTAATTTAAATATATAAATTAATATATATATATTGAATTATATATAAATTTATT -TTATAATTTTATAAATAATATATTATTATAAATATTTAATATAATTTATATTATTATTAA -ATAAAAGATTTATTAAATTAATATTATTATTTAATTTTATTATATAGTTTAAGGGATAAT -ATTTTATTAATATTTTTTTTATTTATTTATTTAATTATATTATATATATAATATATATAT -AACAATAAATTTATGACACATTTAGAAAGAAGTAGACATCAACAACATCCATTTCATATG -GTTATGCCTTCACCATGACCTATTGTAGTATCATTTGCATTATTATCATTAGCATTATCA -CTAGCATTAACAATGCATGGTTATATTGGTAATATGAATATGGTATATTTAGCATTATTT -GTATTATTAACAAGTTCTATTTTATGATTTAGAGATATTGTAGCTGAAGCTACATATTTA -GGTGATCATACTATAGCAGTAAGAAAAGGTATTAATTTAGGTTTCTTAATGTTTGTATTA -TCTGAAGTATTAATCTTTGCTGGTTTATTCTGAGCTTATTTCCATTCAGCTATGAGTCCT -GATGTACTATTAGGTGCATGTTGACCACCCGTAGGTATTGAAGCTGTACAACCTACCGAA -TTACCTTTATTAAATACTATTATCTTATTATCTTCTGGTGCTACTGTAACTTATAGTCAT -CATGCCTTAATCGCAGGTAATAGAAATAAAGCCTTATCAGGTTTATTAATTACATTCTGA -TTAATTGTTATTTTTGTTACTTGTCAATATATTGAATATACTAATGCTGCATTCACTATC -TCTGATGGTGTTTATGGTTCAGTATTCTATGCTGGTACAGGATTACATTTCTTACATATG -GTAATGTTAGCAGCTATGTTAGGTGTTAATTATTGAAGAATGAGAAATTATCATTTAACA 
-GCTGGACATCATGTTGGATATGAAACAACTATTATTTATCTACATGTTTTAGATGTTATC -TGATTATTTTTATACGTAGTCTTCTACTGATGAGGAGTCTAAGGCTATAGAATTATATAT -CTAAATGATTAATATATATATTATTAATAATTAACAATAATTAATATATTATAATTTATA -TATATATATTTTATATTATTATAATAATATTCTTACAAATATAATTATTATATATTATTC -CTTCAAAACTCCTAACGGGGTTCCCGCGAAGCGGGAACTAATAATAATATAATCATTATA -CTCTTTTTTCATTTACCTTTTATAAAGATAATTAATAAATTTATTTAATATTTATAAAAA -AAAAAATATAATATTAATATAATATAATATAATAATGTAATTATTTATATTTTTATATTC -CTTCGAGGTCACCGCCTCACCTCCAGCGGGACTTTTTTAATATGATATAATATAATATAA -ATATTATTAATTTAACTAATATATAAATTCATATATATATATATATTATTAATATTATTT -TATAAAAAATATTTTTTATTTGATTATTATTAAATATTATATAGTTCCGGGGCCCGGCCA -CGGGAGCCGGAACCCCGAAAGGAGAAATATTAATATATTATAAATATACTATTTATGTAA -TTATTTTTTGAAGTGAGCACCTATTTTATATATATTTTATATATATTTTATTATATTTTA -TTAAAAATAGGTGTGAACCTCCATGAGAGAGGAATGAATACCTATTTTATAAAGTATATT -TTATATTCTATATATTATAAATATGAACCAAAAAAAGGAGTTTAAAATTTAATTAAATTT -AATTAATTGAATTTCTTTATTATTATTATCATAATTATTAAACCCTTTATTAATATAATA -ATATATTATTTATTATCAAAATACCTACCCTTTTTATAATTTATATCTTTAATAATATAA -TTAAATATAAAATGTTTATTAAATATTATATAAAAATAAAAATAAAAATATATATATATA -TATAAATGATAAATAATAAGGAATTCACACTTATATAAATTTAAATATAAAGTCCCAAAA -GAAGTATTCATTAAATAAATTATCATTAATTAATTATAATAAACTTATTTAATATTATTA -AAGATTAATTTATAATAATAATTATTATTATTATTATTAATATTAATAAAATATATAAAT -AATTAAATAGTTCATATATTAAAAAGAATTAGAATTAAACTTTAATAAGTGTATTTAATA -TATAGAATATTAATAGAATATTTATTCTATTTATATATATATTTATATATATATATATTA -AATAATATTATTTATATTATATTTTATATATATATTATTAATATAAAAAGTATATTATAT -GTATTATATATATTATATATTATATATTTAATAATATATTACTCCTTTGGGGTGGGTCCG -CCCCACGGGGCGGGCCGGACTATTATAATTAATAATTTTATAAAGTTCCGGGGCCCGGCC -ACGGGAGCCGGAACCCCGAAAGGAGAATAAATAATTATATATCTTCTTCTTAATTAAATT -AAATTAAATTAAATTAAATTAAATTAAATTAAATTAAAAAGGGGTTCGGTCCCCCTCCCT -AACGGGAGGGGGTCCCTCACTCATTCAAACTATAATTTAATATATTATGATATTATTTAT -AATTTATAATATAATGTATAATATTATATTATAAATATTATATAAAAATAAAATGATATA -TATAATAATAATAATAATAATAATAAAAAAATAGAAAAGAATAATTTTTATTATTTTAGT -ATATATAAGAATTTAATAAGTTATATTATTGCGGACACCGTTACGCGGAGTGGGGACTAT -TATATTTTACCTATATATATTAATATTATTATAATTTCCTTCTTTAAAAGAAAAAAGGAA 
-TTCGAGAACTTATTATTATATTAATATATTAATAATAAATAATAATAAATAATAAAAAAG -TAAATAATTATAAATTATATAAAAATATAATTTTATTATTAAGAAAGGAGTTTAAATATA -AAATATAATATTATCATTAAGTTCTAATAAAGGTATATAATGAAGATCTATTAGAACCTA -AAAAGAATATTAATATATCTATTATAAAATAATAATAATAAATATAAATATAAAAATAAA -TTGTAATATTTATAAATAATAATAAAAAATAAATAAGGAATATATTAATTATTAATAATA -AATAAATTATATTAAAATATAATATTATTATTAAATTAAAGAATTATATTAAATATATTT -ATTAAAATTTTATAAATAAGTTAATATTTTATTAAATAATATTTATAAATAATAAAAAAA -AATAAGTATATAATTATTAATATATTAATTTATTATGTTATATATTTATATATTTCAAAT -ATATAAGTAATAGGGGGAGGGGGTGGGTGATAATAACCAGAATATTAAATAAATACAGAG -CACACATTTGTTAATATTTAATAATATAATCAATAAATATATTATAATAATATAATATAA -TTAATAATAGATATAAAGTATAAACAATATAATAAATTATATAAAATAAATATAAATTAA -AAATAATAACCAAATAATTAATATAATAAATGATAAACAAGAAGATATCCGGGTCCCAAT -AATAATTATTATTGAAAATAATAATTGGGACCCCCACAATAGAATAAAAAATAAAAAGAA -TTAATAATATATAAATAATATAAAATATATTATATATATATATAATATATATATATATAT -AATAAAAAAAAATATATAATATAATATATATATATAAAATAATAAATTATATATATATAT -ATAAAATAATAAAAAATAATAATCATATGAATTTTATAAATATAATTATTATTAATAATA -ATAATAATAATAATAAAGTCCGGTCCGCCCCGCGGAGGGGGCGGACCCCCGAAGGAGTGC -GGGACCCCGTGGGAACCGCATCCCTTTTTATTCTTAATTAAGAAGGAGATAATAATTTAT -AAAAATTAATATTTATTTTATGTAATATTAATATTAATATTAATATAATATAATATAATA -TAATACGGATTAAATATTACCAGTTGTTCACAGGTAATATAAAATCCTATTGTTTCACCT -ATTATTAATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGAAAGGAGAATAA -GTATATATAATAAAATTTAATAAAAAAAAATAATTATATAATAAATATATATATTATAAT -ATTATATAAATATAAAATATAATTGATATTAACATTATATAATTAATAATATAATCAAAT -AATATAAATATAATATAAAAAGTTTTAATTATTAAAATTATATAAATATTATTTAATAAA -AATAAAAATAATAATAATAATAATAATAATAAAGTCCGGTCCGCCCCCTCCGCGGAGGGG -GCGGACCCCGAAAGAGTGAGGGACCCCCCCGTATACTTACGGGGGGAGAACCGAACCCCT -TTTTTTATTTAAAGAAGGAGATAAATATTTATATCTTTATTTATAATTATATATAAATAA -AAGTTTATTAAAATTTATAATAATAATATAAAAAAGTATATAATAAATTTATTATAAATA -AATAAATATTTAGTAATAATATTTAATAAAATTATAAATATTATAAATAAAATATTAATA -ATAAATAATAAATATATAATATAATATAATATAATAAATTAATAACAATAAGATATCCGG -GTCCCCTAAATAATTATTATATAAAATAATAATTGGGACCCATACATATAAATATAAAAT -ATTTTAATATTTATATATAAATAATAATAATATATATTTATATTATATTATAATATAACC 
-CTTTCCAATTAATATTAATATTAATATTAATTACTTCCTTAAAAAAATAATAATTAATTA -ATTGATTTTTATATTAATATAAAAAAGTTAATATATATATTTATATATAAATAATATAAA -TTAATATAAAGATAATAAGTCCCCGCTTTCAGCGCAGTGAGGGACCCCCTCCCGTAAATA -TACGGGAGGGGAGACCGAACCCCAAAGGAATAATAAATAATAGTATGTATTTAAATAAAT -ATTTAATATACTATTTTTTTTTATTATTTTTATAATATATTTATAATAATATATTTAATT -ATAATTTATAAAAAAGAGATATAATATTTTATTATATATAATATTAATATAATACAAATT -AACATTATTTAATTATTATTAATAATATTTAACTTTATTATTATCTTCTACGGTTGGACT -CCTTCTTAAAAAGGGGTTCGGTCCCCCTCCCATTAGGGAGGGGTCCCTCACTCCTTCGGG -GTCCGCGCCCCCCGCGGGGGGGGGCGGACCGGACTATTATTACTATTTATTTATTAATAA -TAAATAATAAATTATAAAGTCACTGAAAGAGTGAGGAATTTTCCTTTTCCCAAGGGAAAA -CCCCAAAGGATAATATAAATATTATAAAATTTTTATTAAATAATATAAAATTCAATAAAA -TAATTTTAATTAATTAATTAATTAATTAATATAAAAATAAATATTTTTAATTAATATTAA -TATTAATAGTTCCGGGGCCCGGCCACGGGAGCCGGAACCCCGGAAGGAGAAATATAAATA -TAATAGTATAGTATATAGGAAGTTAATAATAATATAAATATTATATAATATATATATGTA -TATATATTATATTATATAATTAATTTTCTCCTTTTGTATTTACATCTTAATAAAATATAA -AATATAAAATGTTATTAACAATAAAAATTATTAATCTTTATAATATTAATAATAGTAAAT -TTATTTATATATCTCCTTTAGGATGGACTCCTTCGGCCGGACTCCTTCGGGGTCCGCCCC -GCGGGGGCGGGCCGGACTATTTTTATTTTTTTTTTAAAAAATATTAAATATTATAAATAT -ATTATAAATATATTATAAATATGTTATAAATATATTATAAATAGAATATAATATAATATT -ATATATTATAATGATAAAGATTATATATATTTTCTTTTTTTTTTTATTTATTATTTTTAA -TAAGTAAAAATTATATTATATATATATATATATTAGATTTTATAAGTAATATAATATAAG -TATTAATATATAAATGCAATATGATGTAATTGGTTAACATTTTAGGGTCATGACCTAATT -ATATACGTTCAAATCGTATTATTGCTAATAAATTAATATATAATATTTATAAAAAAGTAT -AATAAAATATATTATAAGAAGAATATATTATATAATAATTATATTAATAATATTAATAAA -TAATATATAAATAATTATAAAAAAGTATATAATATTAATCAATTAATTAATTAATAAATA -TAAATAATATATTAATTTTTAATTAATTTGAATAAGATATTTATATTATTAATAGGAAAG -TCATAAATATATAAATTATATTATATAATTAATATAATAATAAAATAAATTATATATTTT -ATTTATAATATTATTTCTTTATAAGATAAAATATTATCTGATGAATAATTAGATTGAATA -ATATTTATAAAGAAATATATATAAAAAGTCATTATATAAATTTAATTATAATTTAAATAA -ATTTTATATAAATTAATATAATATTAATAAAGTAATTAGTATAAATAAATAATATGAAAA -TAAAACTTAATAAATATATAAATATAGTCCGGCCCGCCCCCCCGCGGCGGGCGGACCCCG -AAGGAGTGAGGGACCCCTCCCTAATGGGAGGGGGACCGAACCCCTTTTTAAGAAGGAGTC 
-CATATATATATATTAATAAAAAAAAGTAATATATATATATATATTGGAATAGTTATATTA -TTATACAGAAATATGCTTAATTATAATATAATATCCATA diff --git a/srrTomat0/tests/artifacts/sc64_mito.gtf b/srrTomat0/tests/artifacts/sc64_mito.gtf deleted file mode 100644 index 98d6b11..0000000 --- a/srrTomat0/tests/artifacts/sc64_mito.gtf +++ /dev/null @@ -1,317 +0,0 @@ -#!genome-build R64-1-1 -#!genome-version R64-1-1 -#!genome-date 2011-09 -#!genome-build-accession GCA_000146045.2 -#!genebuild-last-updated 2018-10 -Mito sgd gene 731 802 . + . gene_id "tP(UGG)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 731 802 . + . gene_id "tP(UGG)Q"; transcript_id "tP(UGG)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 731 802 . + . gene_id "tP(UGG)Q"; transcript_id "tP(UGG)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tP(UGG)Q_tRNA-E1"; -Mito sgd gene 3952 4338 . + . gene_id "Q0010"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 3952 4338 . + . gene_id "Q0010"; transcript_id "Q0010_mRNA"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 3952 4338 . + . gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0010_mRNA-E1"; -Mito sgd CDS 3952 4335 . + 0 gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0010"; -Mito sgd start_codon 3952 3954 . 
+ 0 gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 4336 4338 . + 0 gene_id "Q0010"; transcript_id "Q0010_mRNA"; exon_number "1"; gene_name "Q0010"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 4254 4415 . + . gene_id "Q0017"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 4254 4415 . + . gene_id "Q0017"; transcript_id "Q0017_mRNA"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 4254 4415 . + . gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0017_mRNA-E1"; -Mito sgd CDS 4254 4412 . + 0 gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0017"; -Mito sgd start_codon 4254 4256 . + 0 gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 4413 4415 . + 0 gene_id "Q0017"; transcript_id "Q0017_mRNA"; exon_number "1"; gene_name "Q0017"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 6546 8194 . + . gene_id "Q0020"; gene_name "15S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; -Mito sgd transcript 6546 8194 . + . 
gene_id "Q0020"; transcript_id "Q0020_rRNA"; gene_name "15S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; -Mito sgd exon 6546 8194 . + . gene_id "Q0020"; transcript_id "Q0020_rRNA"; exon_number "1"; gene_name "15S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; exon_id "Q0020_rRNA-E1"; -Mito sgd gene 9374 9447 . + . gene_id "tW(UCA)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 9374 9447 . + . gene_id "tW(UCA)Q"; transcript_id "tW(UCA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 9374 9447 . + . gene_id "tW(UCA)Q"; transcript_id "tW(UCA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tW(UCA)Q_tRNA-E1"; -Mito sgd gene 11667 11957 . + . gene_id "Q0032"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 11667 11957 . + . gene_id "Q0032"; transcript_id "Q0032_mRNA"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 11667 11957 . + . gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0032_mRNA-E1"; -Mito sgd CDS 11667 11954 . + 0 gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0032"; -Mito sgd start_codon 11667 11669 . + 0 gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 11955 11957 . 
+ 0 gene_id "Q0032"; transcript_id "Q0032_mRNA"; exon_number "1"; gene_name "Q0032"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 13818 26701 . + . gene_id "Q0045"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 13818 26701 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 13818 13986 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "1"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E1"; -Mito sgd CDS 13818 13986 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "1"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "1"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 16435 16470 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "2"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E2"; -Mito sgd CDS 16435 16470 . + 2 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "2"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd exon 18954 18991 . + . 
gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "3"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E3"; -Mito sgd CDS 18954 18991 . + 2 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "3"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd exon 20508 20984 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "4"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E4"; -Mito sgd CDS 20508 20984 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "4"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd exon 21995 22246 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "5"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E5"; -Mito sgd CDS 21995 22246 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "5"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd exon 23612 23746 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "6"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E6"; -Mito sgd CDS 23612 23746 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "6"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd exon 25318 25342 . + . 
gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "7"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E7"; -Mito sgd CDS 25318 25342 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "7"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd exon 26229 26701 . + . gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "8"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0045_mRNA-E8"; -Mito sgd CDS 26229 26698 . + 2 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "8"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0045"; -Mito sgd stop_codon 26699 26701 . + 0 gene_id "Q0045"; transcript_id "Q0045_mRNA"; exon_number "8"; gene_name "COX1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 13818 19996 . + . gene_id "Q0060"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 13818 19996 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 13818 13986 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "1"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0060_mRNA-E1"; -Mito sgd CDS 13818 13986 . 
+ 0 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "1"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0060"; -Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "1"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 16435 16470 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "2"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0060_mRNA-E2"; -Mito sgd CDS 16435 16470 . + 2 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "2"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0060"; -Mito sgd exon 18954 19996 . + . gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "3"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0060_mRNA-E3"; -Mito sgd CDS 18954 19993 . + 2 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "3"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0060"; -Mito sgd stop_codon 19994 19996 . + 0 gene_id "Q0060"; transcript_id "Q0060_mRNA"; exon_number "3"; gene_name "AI3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 13818 16322 . + . gene_id "Q0050"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 13818 16322 . + . gene_id "Q0050"; transcript_id "Q0050_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 13818 16322 . + . 
gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0050_mRNA-E1"; -Mito sgd CDS 13818 16319 . + 0 gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0050"; -Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 16320 16322 . + 0 gene_id "Q0050"; transcript_id "Q0050_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 13818 18830 . + . gene_id "Q0055"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 13818 18830 . + . gene_id "Q0055"; transcript_id "Q0055_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 13818 13986 . + . gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0055_mRNA-E1"; -Mito sgd CDS 13818 13986 . + 0 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0055"; -Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 16435 18830 . + . 
gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0055_mRNA-E2"; -Mito sgd CDS 16435 18827 . + 2 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0055"; -Mito sgd stop_codon 18828 18830 . + 0 gene_id "Q0055"; transcript_id "Q0055_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 13818 23167 . + . gene_id "Q0070"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 13818 23167 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 13818 13986 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E1"; -Mito sgd CDS 13818 13986 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; -Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 16435 16470 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E2"; -Mito sgd CDS 16435 16470 . 
+ 2 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; -Mito sgd exon 18954 18991 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E3"; -Mito sgd CDS 18954 18991 . + 2 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; -Mito sgd exon 20508 20984 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E4"; -Mito sgd CDS 20508 20984 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; -Mito sgd exon 21995 23167 . + . gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "5"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0070_mRNA-E5"; -Mito sgd CDS 21995 23164 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "5"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0070"; -Mito sgd stop_codon 23165 23167 . + 0 gene_id "Q0070"; transcript_id "Q0070_mRNA"; exon_number "5"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 13818 21935 . + . gene_id "Q0065"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 13818 21935 . + . 
gene_id "Q0065"; transcript_id "Q0065_mRNA"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 13818 13986 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "1"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E1"; -Mito sgd CDS 13818 13986 . + 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "1"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; -Mito sgd start_codon 13818 13820 . + 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "1"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 16435 16470 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "2"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E2"; -Mito sgd CDS 16435 16470 . + 2 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "2"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; -Mito sgd exon 18954 18991 . + . gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "3"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E3"; -Mito sgd CDS 18954 18991 . + 2 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "3"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; -Mito sgd exon 20508 21935 . + . 
gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "4"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0065_mRNA-E4"; -Mito sgd CDS 20508 21932 . + 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "4"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0065"; -Mito sgd stop_codon 21933 21935 . + 0 gene_id "Q0065"; transcript_id "Q0065_mRNA"; exon_number "4"; gene_name "AI4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 24156 25255 . + . gene_id "Q0075"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 24156 25255 . + . gene_id "Q0075"; transcript_id "Q0075_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 24156 24870 . + . gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0075_mRNA-E1"; -Mito sgd CDS 24156 24870 . + 0 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0075"; -Mito sgd start_codon 24156 24158 . + 0 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 24906 25255 . + . gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0075_mRNA-E2"; -Mito sgd CDS 24906 25252 . 
+ 2 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0075"; -Mito sgd stop_codon 25253 25255 . + 0 gene_id "Q0075"; transcript_id "Q0075_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 27666 27812 . + . gene_id "Q0080"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 27666 27812 . + . gene_id "Q0080"; transcript_id "Q0080_mRNA"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 27666 27812 . + . gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0080_mRNA-E1"; -Mito sgd CDS 27666 27809 . + 0 gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0080"; -Mito sgd start_codon 27666 27668 . + 0 gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 27810 27812 . + 0 gene_id "Q0080"; transcript_id "Q0080_mRNA"; exon_number "1"; gene_name "ATP8"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 28487 29266 . + . gene_id "Q0085"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 28487 29266 . + . 
gene_id "Q0085"; transcript_id "Q0085_mRNA"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 28487 29266 . + . gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0085_mRNA-E1"; -Mito sgd CDS 28487 29263 . + 0 gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0085"; -Mito sgd start_codon 28487 28489 . + 0 gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 29264 29266 . + 0 gene_id "Q0085"; transcript_id "Q0085_mRNA"; exon_number "1"; gene_name "ATP6"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 30874 31014 . + . gene_id "Q0092"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 30874 31014 . + . gene_id "Q0092"; transcript_id "Q0092_mRNA"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 30874 31014 . + . gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0092_mRNA-E1"; -Mito sgd CDS 30874 31011 . + 0 gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0092"; -Mito sgd start_codon 30874 30876 . 
+ 0 gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 31012 31014 . + 0 gene_id "Q0092"; transcript_id "Q0092_mRNA"; exon_number "1"; gene_name "Q0092"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 35373 35444 . + . gene_id "tE(UUC)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 35373 35444 . + . gene_id "tE(UUC)Q"; transcript_id "tE(UUC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 35373 35444 . + . gene_id "tE(UUC)Q"; transcript_id "tE(UUC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tE(UUC)Q_tRNA-E1"; -Mito sgd gene 36540 42251 . + . gene_id "Q0120"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 36540 42251 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 36540 36954 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E1"; -Mito sgd CDS 36540 36954 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; -Mito sgd start_codon 36540 36542 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 37723 37736 . + . 
gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E2"; -Mito sgd CDS 37723 37736 . + 2 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; -Mito sgd exon 39141 39217 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E3"; -Mito sgd CDS 39141 39217 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; -Mito sgd exon 40841 42251 . + . gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0120_mRNA-E4"; -Mito sgd CDS 40841 42248 . + 1 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0120"; -Mito sgd stop_codon 42249 42251 . + 0 gene_id "Q0120"; transcript_id "Q0120_mRNA"; exon_number "4"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 36540 38579 . + . gene_id "Q0110"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 36540 38579 . + . gene_id "Q0110"; transcript_id "Q0110_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 36540 36954 . + . 
gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0110_mRNA-E1"; -Mito sgd CDS 36540 36954 . + 0 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0110"; -Mito sgd start_codon 36540 36542 . + 0 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 37723 38579 . + . gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0110_mRNA-E2"; -Mito sgd CDS 37723 38576 . + 2 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0110"; -Mito sgd stop_codon 38577 38579 . + 0 gene_id "Q0110"; transcript_id "Q0110_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 36540 40265 . + . gene_id "Q0115"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 36540 40265 . + . gene_id "Q0115"; transcript_id "Q0115_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 36540 36954 . + . gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0115_mRNA-E1"; -Mito sgd CDS 36540 36954 . 
+ 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0115"; -Mito sgd start_codon 36540 36542 . + 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 37723 37736 . + . gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0115_mRNA-E2"; -Mito sgd CDS 37723 37736 . + 2 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0115"; -Mito sgd exon 39141 40265 . + . gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0115_mRNA-E3"; -Mito sgd CDS 39141 40262 . + 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0115"; -Mito sgd stop_codon 40263 40265 . + 0 gene_id "Q0115"; transcript_id "Q0115_mRNA"; exon_number "3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 36540 43647 . + . gene_id "Q0105"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 36540 43647 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 36540 36954 . + . 
gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "1"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E1"; -Mito sgd CDS 36540 36954 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "1"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; -Mito sgd start_codon 36540 36542 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "1"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 37723 37736 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "2"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E2"; -Mito sgd CDS 37723 37736 . + 2 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "2"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; -Mito sgd exon 39141 39217 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "3"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E3"; -Mito sgd CDS 39141 39217 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "3"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; -Mito sgd exon 40841 41090 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "4"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E4"; -Mito sgd CDS 40841 41090 . 
+ 1 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "4"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; -Mito sgd exon 42508 42558 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "5"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E5"; -Mito sgd CDS 42508 42558 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "5"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; -Mito sgd exon 43297 43647 . + . gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "6"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0105_mRNA-E6"; -Mito sgd CDS 43297 43644 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "6"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0105"; -Mito sgd stop_codon 43645 43647 . + 0 gene_id "Q0105"; transcript_id "Q0105_mRNA"; exon_number "6"; gene_name "COB"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 46723 46953 . + . gene_id "Q0130"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 46723 46953 . + . gene_id "Q0130"; transcript_id "Q0130_mRNA"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 46723 46953 . + . 
gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0130_mRNA-E1"; -Mito sgd CDS 46723 46950 . + 0 gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0130"; -Mito sgd start_codon 46723 46725 . + 0 gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 46951 46953 . + 0 gene_id "Q0130"; transcript_id "Q0130_mRNA"; exon_number "1"; gene_name "OLI1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 48201 48290 . + . gene_id "tS(UGA)Q2"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 48201 48290 . + . gene_id "tS(UGA)Q2"; transcript_id "tS(UGA)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 48201 48290 . + . gene_id "tS(UGA)Q2"; transcript_id "tS(UGA)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tS(UGA)Q2_tRNA-E1"; -Mito sgd gene 48901 50097 . + . gene_id "Q0140"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 48901 50097 . + . gene_id "Q0140"; transcript_id "Q0140_mRNA"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 48901 50097 . + . 
gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0140_mRNA-E1"; -Mito sgd CDS 48901 50094 . + 0 gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0140"; -Mito sgd start_codon 48901 48903 . + 0 gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 50095 50097 . + 0 gene_id "Q0140"; transcript_id "Q0140_mRNA"; exon_number "1"; gene_name "VAR1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 51052 51228 . + . gene_id "Q0142"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 51052 51228 . + . gene_id "Q0142"; transcript_id "Q0142_mRNA"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 51052 51228 . + . gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0142_mRNA-E1"; -Mito sgd CDS 51052 51225 . + 0 gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0142"; -Mito sgd start_codon 51052 51054 . 
+ 0 gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 51226 51228 . + 0 gene_id "Q0142"; transcript_id "Q0142_mRNA"; exon_number "1"; gene_name "Q0142"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 51277 51429 . + . gene_id "Q0143"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 51277 51429 . + . gene_id "Q0143"; transcript_id "Q0143_mRNA"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 51277 51429 . + . gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0143_mRNA-E1"; -Mito sgd CDS 51277 51426 . + 0 gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0143"; -Mito sgd start_codon 51277 51279 . + 0 gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 51427 51429 . + 0 gene_id "Q0143"; transcript_id "Q0143_mRNA"; exon_number "1"; gene_name "Q0143"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 54109 54438 . + . gene_id "Q0144"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 54109 54438 . + . 
gene_id "Q0144"; transcript_id "Q0144_mRNA"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 54109 54438 . + . gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0144_mRNA-E1"; -Mito sgd CDS 54109 54435 . + 0 gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0144"; -Mito sgd start_codon 54109 54111 . + 0 gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 54436 54438 . + 0 gene_id "Q0144"; transcript_id "Q0144_mRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 58009 62447 . + . gene_id "Q0158"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; -Mito sgd transcript 58009 62447 . + . gene_id "Q0158"; transcript_id "Q0158_rRNA"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; -Mito sgd exon 58009 60724 . + . gene_id "Q0158"; transcript_id "Q0158_rRNA"; exon_number "1"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; exon_id "Q0158_rRNA-E1"; -Mito sgd exon 61868 62447 . + . gene_id "Q0158"; transcript_id "Q0158_rRNA"; exon_number "2"; gene_name "21S_RRNA"; gene_source "sgd"; gene_biotype "rRNA"; transcript_source "sgd"; transcript_biotype "rRNA"; exon_id "Q0158_rRNA-E2"; -Mito sgd gene 61022 61729 . + . gene_id "Q0160"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 61022 61729 . + . 
gene_id "Q0160"; transcript_id "Q0160_mRNA"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 61022 61729 . + . gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0160_mRNA-E1"; -Mito sgd CDS 61022 61726 . + 0 gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0160"; -Mito sgd start_codon 61022 61024 . + 0 gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 61727 61729 . + 0 gene_id "Q0160"; transcript_id "Q0160_mRNA"; exon_number "1"; gene_name "SCEI"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 63862 63937 . + . gene_id "tT(UGU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 63862 63937 . + . gene_id "tT(UGU)Q1"; transcript_id "tT(UGU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 63862 63937 . + . gene_id "tT(UGU)Q1"; transcript_id "tT(UGU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tT(UGU)Q1_tRNA-E1"; -Mito sgd gene 64415 64490 . + . gene_id "tC(GCA)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 64415 64490 . + . gene_id "tC(GCA)Q"; transcript_id "tC(GCA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 64415 64490 . + . 
gene_id "tC(GCA)Q"; transcript_id "tC(GCA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tC(GCA)Q_tRNA-E1"; -Mito sgd gene 64596 64670 . + . gene_id "tH(GUG)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 64596 64670 . + . gene_id "tH(GUG)Q"; transcript_id "tH(GUG)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 64596 64670 . + . gene_id "tH(GUG)Q"; transcript_id "tH(GUG)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tH(GUG)Q_tRNA-E1"; -Mito sgd gene 65770 66174 . + . gene_id "Q0182"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 65770 66174 . + . gene_id "Q0182"; transcript_id "Q0182_mRNA"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 65770 66174 . + . gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0182_mRNA-E1"; -Mito sgd CDS 65770 66171 . + 0 gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0182"; -Mito sgd start_codon 65770 65772 . + 0 gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 66172 66174 . 
+ 0 gene_id "Q0182"; transcript_id "Q0182_mRNA"; exon_number "1"; gene_name "Q0182"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 66095 66179 . + . gene_id "tL(UAA)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 66095 66179 . + . gene_id "tL(UAA)Q"; transcript_id "tL(UAA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 66095 66179 . + . gene_id "tL(UAA)Q"; transcript_id "tL(UAA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tL(UAA)Q_tRNA-E1"; -Mito sgd gene 66210 66285 . + . gene_id "tQ(UUG)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 66210 66285 . + . gene_id "tQ(UUG)Q"; transcript_id "tQ(UUG)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 66210 66285 . + . gene_id "tQ(UUG)Q"; transcript_id "tQ(UUG)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tQ(UUG)Q_tRNA-E1"; -Mito sgd gene 67061 67134 . + . gene_id "tK(UUU)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 67061 67134 . + . gene_id "tK(UUU)Q"; transcript_id "tK(UUU)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 67061 67134 . + . gene_id "tK(UUU)Q"; transcript_id "tK(UUU)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tK(UUU)Q_tRNA-E1"; -Mito sgd gene 67309 67381 . + . gene_id "tR(UCU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 67309 67381 . + . gene_id "tR(UCU)Q1"; transcript_id "tR(UCU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 67309 67381 . + . 
gene_id "tR(UCU)Q1"; transcript_id "tR(UCU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tR(UCU)Q1_tRNA-E1"; -Mito sgd gene 67468 67542 . + . gene_id "tG(UCC)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 67468 67542 . + . gene_id "tG(UCC)Q"; transcript_id "tG(UCC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 67468 67542 . + . gene_id "tG(UCC)Q"; transcript_id "tG(UCC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tG(UCC)Q_tRNA-E1"; -Mito sgd gene 68322 68396 . + . gene_id "tD(GUC)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 68322 68396 . + . gene_id "tD(GUC)Q"; transcript_id "tD(GUC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 68322 68396 . + . gene_id "tD(GUC)Q"; transcript_id "tD(GUC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tD(GUC)Q_tRNA-E1"; -Mito sgd gene 69203 69288 . + . gene_id "tS(GCU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 69203 69288 . + . gene_id "tS(GCU)Q1"; transcript_id "tS(GCU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 69203 69288 . + . gene_id "tS(GCU)Q1"; transcript_id "tS(GCU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tS(GCU)Q1_tRNA-E1"; -Mito sgd gene 69289 69362 . + . gene_id "tR(ACG)Q2"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 69289 69362 . + . gene_id "tR(ACG)Q2"; transcript_id "tR(ACG)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 69289 69362 . + . 
gene_id "tR(ACG)Q2"; transcript_id "tR(ACG)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tR(ACG)Q2_tRNA-E1"; -Mito sgd gene 69846 69921 . + . gene_id "tA(UGC)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 69846 69921 . + . gene_id "tA(UGC)Q"; transcript_id "tA(UGC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 69846 69921 . + . gene_id "tA(UGC)Q"; transcript_id "tA(UGC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tA(UGC)Q_tRNA-E1"; -Mito sgd gene 70162 70237 . + . gene_id "tI(GAU)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 70162 70237 . + . gene_id "tI(GAU)Q"; transcript_id "tI(GAU)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 70162 70237 . + . gene_id "tI(GAU)Q"; transcript_id "tI(GAU)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tI(GAU)Q_tRNA-E1"; -Mito sgd gene 70824 70907 . + . gene_id "tY(GUA)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 70824 70907 . + . gene_id "tY(GUA)Q"; transcript_id "tY(GUA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 70824 70907 . + . gene_id "tY(GUA)Q"; transcript_id "tY(GUA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tY(GUA)Q_tRNA-E1"; -Mito sgd gene 71433 71503 . + . gene_id "tN(GUU)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 71433 71503 . + . gene_id "tN(GUU)Q"; transcript_id "tN(GUU)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 71433 71503 . + . 
gene_id "tN(GUU)Q"; transcript_id "tN(GUU)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tN(GUU)Q_tRNA-E1"; -Mito sgd gene 72630 72705 . + . gene_id "tM(CAU)Q1"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 72630 72705 . + . gene_id "tM(CAU)Q1"; transcript_id "tM(CAU)Q1_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 72630 72705 . + . gene_id "tM(CAU)Q1"; transcript_id "tM(CAU)Q1_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tM(CAU)Q1_tRNA-E1"; -Mito sgd gene 73758 74513 . + . gene_id "Q0250"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 73758 74513 . + . gene_id "Q0250"; transcript_id "Q0250_mRNA"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 73758 74513 . + . gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0250_mRNA-E1"; -Mito sgd CDS 73758 74510 . + 0 gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0250"; -Mito sgd start_codon 73758 73760 . + 0 gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 74511 74513 . 
+ 0 gene_id "Q0250"; transcript_id "Q0250_mRNA"; exon_number "1"; gene_name "COX2"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 74495 75984 . + . gene_id "Q0255"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 74495 75984 . + . gene_id "Q0255"; transcript_id "Q0255_mRNA"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 74495 75622 . + . gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "1"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0255_mRNA-E1"; -Mito sgd CDS 74495 75622 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "1"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0255"; -Mito sgd start_codon 74495 74497 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "1"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 75663 75872 . + . gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "2"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0255_mRNA-E2"; -Mito sgd CDS 75663 75872 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "2"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0255"; -Mito sgd exon 75904 75984 . + . 
gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "3"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0255_mRNA-E3"; -Mito sgd CDS 75904 75981 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "3"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0255"; -Mito sgd stop_codon 75982 75984 . + 0 gene_id "Q0255"; transcript_id "Q0255_mRNA"; exon_number "3"; gene_name "Q0255"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 77431 77505 . + . gene_id "tF(GAA)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 77431 77505 . + . gene_id "tF(GAA)Q"; transcript_id "tF(GAA)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 77431 77505 . + . gene_id "tF(GAA)Q"; transcript_id "tF(GAA)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tF(GAA)Q_tRNA-E1"; -Mito sgd gene 78089 78162 . - . gene_id "tT(UAG)Q2"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 78089 78162 . - . gene_id "tT(UAG)Q2"; transcript_id "tT(UAG)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 78089 78162 . - . gene_id "tT(UAG)Q2"; transcript_id "tT(UAG)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tT(UAG)Q2_tRNA-E1"; -Mito sgd gene 78533 78608 . + . gene_id "tV(UAC)Q"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 78533 78608 . + . gene_id "tV(UAC)Q"; transcript_id "tV(UAC)Q_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 78533 78608 . 
+ . gene_id "tV(UAC)Q"; transcript_id "tV(UAC)Q_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tV(UAC)Q_tRNA-E1"; -Mito sgd gene 79213 80022 . + . gene_id "Q0275"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 79213 80022 . + . gene_id "Q0275"; transcript_id "Q0275_mRNA"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 79213 80022 . + . gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0275_mRNA-E1"; -Mito sgd CDS 79213 80019 . + 0 gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0275"; -Mito sgd start_codon 79213 79215 . + 0 gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 80020 80022 . + 0 gene_id "Q0275"; transcript_id "Q0275_mRNA"; exon_number "1"; gene_name "COX3"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd gene 85035 85112 . + . gene_id "tM(CAU)Q2"; gene_source "sgd"; gene_biotype "tRNA"; -Mito sgd transcript 85035 85112 . + . gene_id "tM(CAU)Q2"; transcript_id "tM(CAU)Q2_tRNA"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; -Mito sgd exon 85035 85112 . + . 
gene_id "tM(CAU)Q2"; transcript_id "tM(CAU)Q2_tRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "tRNA"; transcript_source "sgd"; transcript_biotype "tRNA"; exon_id "tM(CAU)Q2_tRNA-E1"; -Mito sgd gene 85295 85777 . + . gene_id "Q0285"; gene_source "sgd"; gene_biotype "ncRNA"; -Mito sgd transcript 85295 85777 . + . gene_id "Q0285"; transcript_id "Q0285_ncRNA"; gene_source "sgd"; gene_biotype "ncRNA"; transcript_source "sgd"; transcript_biotype "ncRNA"; -Mito sgd exon 85295 85777 . + . gene_id "Q0285"; transcript_id "Q0285_ncRNA"; exon_number "1"; gene_source "sgd"; gene_biotype "ncRNA"; transcript_source "sgd"; transcript_biotype "ncRNA"; exon_id "Q0285_ncRNA-E1"; -Mito sgd gene 85554 85709 . + . gene_id "Q0297"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; -Mito sgd transcript 85554 85709 . + . gene_id "Q0297"; transcript_id "Q0297_mRNA"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd exon 85554 85709 . + . gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "Q0297_mRNA-E1"; -Mito sgd CDS 85554 85706 . + 0 gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "Q0297"; -Mito sgd start_codon 85554 85556 . + 0 gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; -Mito sgd stop_codon 85707 85709 . 
+ 0 gene_id "Q0297"; transcript_id "Q0297_mRNA"; exon_number "1"; gene_name "Q0297"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; diff --git a/srrTomat0/tests/artifacts/test.meme b/srrTomat0/tests/artifacts/test.meme deleted file mode 100644 index 7054798..0000000 --- a/srrTomat0/tests/artifacts/test.meme +++ /dev/null @@ -1,23 +0,0 @@ -MEME version 4 - -ALPHABET= ACGT - -strands: - + - -Background letter frequencies: -A 0.25000 C 0.25000 G 0.25000 T 0.25000 - -MOTIF M00799_2.00 Gata4 - -letter-probability matrix: alength= 4 w= 9 - 0.248650 0.261399 0.241301 0.248650 - 0.392226 0.219607 0.070234 0.317932 - 0.025762 0.011346 0.942829 0.020063 - 0.931184 0.018110 0.009175 0.041532 - 0.020910 0.020496 0.005004 0.953590 - 0.652610 0.066752 0.028247 0.252391 - 0.541666 0.110460 0.158325 0.189549 - 0.196060 0.286696 0.350172 0.167072 - 0.286130 0.225078 0.309961 0.178832 - -URL diff --git a/srrTomat0/tests/artifacts/test.motif b/srrTomat0/tests/artifacts/test.motif deleted file mode 100644 index b25c648..0000000 --- a/srrTomat0/tests/artifacts/test.motif +++ /dev/null @@ -1,10 +0,0 @@ ->CAGATAAGG M00799_2.00 5.408651 - 0.2487 0.2614 0.2413 0.2487 - 0.3922 0.2196 0.0702 0.3179 - 0.0258 0.0113 0.9428 0.0201 - 0.9312 0.0181 0.0092 0.0415 - 0.0209 0.0205 0.0050 0.9536 - 0.6526 0.0668 0.0282 0.2524 - 0.5417 0.1105 0.1583 0.1895 - 0.1961 0.2867 0.3502 0.1671 - 0.2861 0.2251 0.3100 0.1788 diff --git a/srrTomat0/tests/artifacts/test_alignment.fastq.gz b/srrTomat0/tests/artifacts/test_alignment.fastq.gz deleted file mode 100644 index fd8687da23c17247a6bb8208ec669bf8a7498472..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9201 zcmVZ02o73;%#&F9x%k|Hp`oAeE`^K6#$~p6m4Ot}dw>R}_;B27_7u zbnO58`0vZ>AO7;sugCI8+n=BJpa1iJ9ozEH^=RfCpew^RaQT?HV&9MRp)Z}kp2v!uXViLrucGIjXWPl=i|3Pa zzE;Zl^T26`zU6$Q9{%IM{qc`-KK6b8$9?dyv-osyYW5TG^N)WrE3BV^6<%vHBfJ_T 
z=;a*tRq;cw;n1g|S7aA+{?~p9-k?{iC(BXoG&PyM`~0$Z9(F{^X#^2k;dZrnY0_`$ zc8ybSg#5iB^e^aN7NHS7BO}bOw7yR(Y+GXmy|a2<=XbrI=N&&io2!RO26 zJbajS>=m_~+CZ&wz(v$b=X>ETcJ%z`Rv6ygA1?IAhxoDc4^Onhylx{iydTf?G{gJT zm_h9S6!%x`UYj@L_KJi9@Q(9l!&RzUtCZp=AM`785`U@cijYb4=1%9gJlT))_D#D? ze&_X9JxAK;$eeL#&|FTcnj`PIR95`xr zi1(Y#+P|3G44;u1-tTle!}+YSf_d;6*s4ar^QTx0M3k^)!GH7?UKwwEa{~NGrdEvoznb{cT!dd5p6}(x;E5i;or)bZOi2*VTXiUd{V05txUliKJRg zIFJj>-F0r+mD@xWy-o-#8yFj!Rj0KQ=h2Q zoDOmRIRA4iyhpyUJRa*ypZAf$uF}>%vZP+3x_C7kSfs3WsNjTh#FDX-u?p<&>)$!_ zj0&HCOPCq-s&BDwy5H#DRqjWwcSls1t`~htET4IlEI;q%e7ZbZX9ySffXJYz<|+1S z)pU6PP#_pqkEp%~Y&n@LSiZp@tz1WlkLP1bmJz4h^&uRbU&(T)Sfb2=hyX7*U*=Qw zfzwq^XTk5;0)*sDD?HOOc82AQkMpy|3>wKa@aY}ZaChK;UxT7Kyj`?8yShiIxaS5I zDlWS?&?-Py_BU)**lM%*XwD$rr8mzA4-(czM3Ju;3hA-^JJ*pg!zf$69_g)CNWB$o zr2`KXgFO0NV}xMg!%=Xrb;aA|ik$%nSrJ{u4wS=Th?(Jh(XxbKRp3rf`AnI*%q`rX zF#t(0WxL~3PQ1}bCz=|wW(#!!z4w+QG(p-QlHFc(D}2(x3eS%Q$$4G+ED;aFfeP@$ z*Gj#96F&2mBlUU`?=UYo9fj`zeoS3r`+GbbL@(o9`PN*yc8I~!Bk&OLV~JRSK#6aF z5^`YGcnCG%Xy5@@$`0zvI?O;wCKebF_K63?ZgK`#5abG7`;HWL9LeVyX|j91TPko< zrU`Gc=c1>CFRa5_V&0ZTpTD}mfLLc<&MdD0*o9_$wd`Q3gqiMExGrBb=GzLl-!(SqEl$(kTgH&3hDP;+^NKab zXtvk^D-3nJ71AhIp4Y#9)C}(*BE#3m@$19Q3E!V-6_&=wr16o}1HFFwAy}xx%a>Q; zp@;G>BEx5tCDx?1#GJOy3iN-w+bEh)A_ELC?e!aeaEu+Y=rR8xx-P$Z6ES7;3EhreE z)HN+Y20}o9;`=bbfTBa3h_R?Bh9C@0S|}|{J6IC7h_5v<1zTh2d5h7nUg{jSSnnGlMW8kQynP=QIc~?=x6fg-MWUWQ;#yElIu#u%jLb zsK(?deqTxjBT%6PQ4t_5hez73>*8#!=Yh?E|ksiZr zIU_WDZg~Q6p;&y^B_N&(SyvktW)7(b$rgZuYHI?RN-DFY1(3to7_%lo18$nTxp+XZ zdquWP+o+m&&uWEbz7-P=oGN$%E2TlK z2uL*4)rVgsfHy;7FF~N0ENZjpaDfg z+8GQ`Uq*BYAUOdT@VI{Ww`TG3BiTvW7zzar$SYxnQMO!+jb3gv1AM%oBqn_(;7~0E z9y9{@l%vTM$P$%rB@J!wHOQnk)1>PKHhn<|8cxK663RlK#{@sdfcgb*O|dxC4bG*I zcT`oh!uEI!^Tg-1XqfCsC$YQ8yG};l2u5&5&OZdHKp0&aDXi``;HY7d@u4{KupIMa zfb<36hvfG9H;;%Y_KR@I$mI<0bsh2yZ}W!Nc0&zDy8$Y*yW%`TbZeYVp|^l&vzn?f zc@l;j&=w~vZ0+34&@Oy>fpnZBZg-nR^fJjb!DTu_S)>Rv3`wVDJyYswhHbuu1~HVt zqOw+mO7A{ZWQu}A1|iL6Y|R`XDJ&pZAt4~84*FF=gakD(Pig;C;`&CWHhl+1SY;XRM8^8G-A;p+YQ_DH>JtsJ5Kv 
zvg82u?4%mupoKc3>4MG`N(wJFkYRzLfEaG7RidhFS{4UxazjG1t_n{2>dkn) z#kOV8YV6QWLS}V$7?rGUIxjn0q2=UrCL_D=4!t#K>Z^%G=J{*%>mI}Q?e?6{U!L>a zw=HqWAlME02@(XVW+XuelynC+v~Yg4_C+xS;qZxth1r)#+KG}(TJc&J8Au6Vnh1EG zJHAf*helO&FBm7ZHxYSae!NBn#r!@YgWBQQQ7$N&NT*j80u~;R0F)-sK(E4(CHgb~ z-Z0HUEKcGsnzv|Aq%?g&PX_(ep)FgWL1P)fnTLQ3j29jqkS2_&mplnyzU(lhA=CWD z48`k4@la$CeCkZrF;erGTl%p;#fpnyfaNV&aus8noqrsA=4hMahJozEcs| z+LW0cOxPti1dWmn_6BJ*4f8yT4)52po@RJHZp|tyn5>?z-jq4obg3or@WvB6YwR z_ChViapgtH!mTaQX+)Ts8+&tvZwWhye4LT)Iyx8oG%v%XnO>`ghm^WfyCou{QWV#) zP@E9571y+coCOQQIZV*F{sVtGmc`@(4uAzC4-6FL?9?K{L0qZfMad0g6@?`Y$-J2& zJUr7dJbY#g51+12L7D(MmIIkU2&j+4^=)!^hhC34EFi;D4R}|)Xa=e!bpm%ICUene*3b?>J+DJVJ%2vW^S5>B?La!Q0z+7B7)~!? z0b3FVx6!q;#K7!nLBO>w1yUDa2`6PIO5k=MGtG=zAN5?4FM=?CNisx5|^wC z(ZIUA2RJFDg@a9S3nK+IOsQdq{NqJWUHu%9aB>2|F!W`Ny#X958w2QS>@e(Ar04s6 zw*#ukLIhn3gP=VprG%!wfSE509xeclHn?{!OF>Zq?ZZK0Ho^n$YrevLx(w2NQ+i2P z#}v(q<}5v6!F4j?YQH*Hb9#+v`gwjo&#bw=J5i)!Xe@dJ19J8t>MJWM@Fi|r5Gj;GQrgISPX4_DP8SyAs4PPgD^xoiWGhf~iWAi&>e;)RSSv)c-f@!F z(Al{8Ns^bJMqa6~VYj};t|>IDxKeSJ))vM~N`oHKSUA3&$l`}^8M)liVRc16 zGKvI%C|)@;NO4V!mucwQ8Nq2aM|C?SxoS-mi2@yV>5*b?Fq@#1EZCNSZD& zdMEJ&J_aZf#xOC1j~!!S+t1{->Big z37^tE&sP~iPV(aD&M=wutBI4_d)Ul;Z_m>V^V+$?-{>K%HCzFMYNHZGP3{gL>Kk%} z6*f6*D894;cSeRCEd>%ew4K-+(Ow(xV zTkU0`!tL-LH}d91DY@6&he=RTg&hme8kI95=tNYn8A{X+0XDDFqs;Xh0dh|vd-ih5X~xN$76uTeD^kQyY3#@^TU(bu3^j3QcsR4> ztK*`C0$?|Sl@8Xd0&qh0O%@IX2(luj^{%64i{c1L1SZh*^>BwKU+mi47RHI30vKd= zs`h=C3q#sV*JKG)$H7fzix6Rkq2yyp+e??lv&)(VnI3iF0YMu&vevZoC}0GG_43fd zJE)Wek%jI**f`qIiqeCe+$03QT3hDnN`gE4lDIAkovxIc5ke={rV!0oSlkUdn05-G zGVJgkF|_MvTh$Edn`}`^6YNX(6!gq%1rIW=~eBJ8J#{DL#$ zs;!9hYt1}qazEja7A$i?$>lK|JlmEfnTWdAiiUWH)N5hLJKRr-bJbRN`k)B`5*j!~ zB0>sIq)tT2w~OLFh6~9{H0aRkhkQNVp=Y9mdJare)guIO zl@h@Mvg&66T!jG6PN{{1Y#d$_xq>9i176(ZjvzrXP3?_uSZPOJ`*cYNt=zUQ&KV-p zbh*QGlrz_5)&i@SzmRkW)qKF;Ia>?T6+_5u{x)QbhZU(I5<7^YO1z*ik$4Ct;P-0y z|I8y_%D81-h9k!<0$SN7$$jG;iH!RCgN&b8&Et!^10K;wt!b3$- zfyNzg{8dmHvb4B~kqjHT!2lzSM{2IP?iQ06utRES$=jn%rh7z(A0N$ew12EC6n1!x 
z>dbXnA8Lj!Mshq#I3{6oCkNQV_8f0GF1u80Xd$uVMWN;t!_Jm_*#NQU%oUTX72@J4 ze^Mw!8?|F6A$gcw3=^*;^A8_5LJ51BVW|9Awsldix~s3OTPKq|%-T?piyyMcyullWaWnRajN?9sLe zI035WW+tzcGLs21Aifkvxza5=-_VdbVH*&EW^9Xl^9k!wLSk>Hd$zKvjg%3qG!b;913O z8KarsKfG6em%)&l>|S%lx%*c3Gn-{Gthcwcm>E9389>lu_y~G1i*(fsP0xlLJFOVX zV7m&_-e`!j&>~gm$kH%W2KUr-V0Ooni4cN=1=_e1o=kE!`F1xAL1TIvabo7$0cu&t z+kwrSZhn-uZYL}w3s6`T07Qs5Kvq_aTKcX&5ZxCjF4OhRc>j*q7i*0g}wq7V$I5!cpY3O}8 zQF4%ohA6a_D@PpSs=y)v9uI0Ii*SpeLJ%bxim1u?12473prJE!<^+=q#jo6%yE-hR z0M9o34gx&FgOaWze|Rb#w{<)J6oTalm26YHgxp&hg=zrz7mTfO@W;o-H8vgUJLlbYdqOPaX*7VS$+qm_E_5Hx}XY!vyX zQZZN{A*e!h6z6AvH*G^TOj=l0q;43&AUCr$NC8$!X2>IBG!ZJ{l(C}$TB17|yc&BG z8b6mijHj@)r+M0;YwZ#fi}xpM_0ckVxj3za3{ibTT|BQyWDg1LF@FIwiA}+P5SncN zJy^*1Co(k%WB>M4{lj60Gz^m`q<+{&A04K)9A6-bgM`8EJmwX-?#VxA*)=FjDu?z)5VfS| zFk8G{W`@4e@AR5bMeS9zDWplTATU^EUeU~{E~LcUV(UDZ(PzI6OU1x$Fsp5Fyx-~Q zL`c*6y`D@+8Wv3OpCOKGd$cqzE!NN5>~Py34NAvrVROOOjFv%!(9^>X+t?28w5TfT z?;?Qt`O$8d*Y>d*XlUDlwK%i$j`oq=6RuZm9R-N`jS22^MQE5@=guE0^`1}*vM(jX z3^nGE$v+E4vFK{Kl8^usyQiS_7izJ{7HJAsn0(BDSUWO-xyF?i_CCWnIg9PU3`*wA zza4gXjdwixyw6&5>vn{Kp09X|2Gq3p)G|~6;ui`Q({~eO?~>rjIU4707$ZXQpujg# zK~H1sJD6r&@)8)ca{`nU)71IO~%^}MWVSHceg@)kK| zZ+92F+E|};C zne(j#<(k~xvA3HM2!1uXLJ%>su73c^NDcc$JvBHNZ26*3jj^zzj5{>PhMJFgNKB?j zQg`TNg0*_PB|u|LCtDxOT>y+WH$bSY%nF8X$W~rsTMOq5LZVPBG);h?7EjY2{Ee zy^3XtflEi#pdgdg$`f53kV1py1X7l@rMSK!a6}_36O5W*cgZXrdU!s@XmZvX>;3Gv z{P@p|?Elv6swYj74tq=|;)YWc7OGh1Rl;2+ZU+`+*7denT)8pBaDRj6@_gLw@Er=a zrUNI{qR)G&r*Q_G-)K?@woD7ZanCSg_)OZZQSmo^gdK)*$>+9hYKE?TbBRVdl96#WN*q%Dj9J5+lFTQ9pmm)9A|Ha8*Rmi zDUBIM$>L=@?)GNjieT#wyuD|~Xy(xi3O1=f!Z!}@?W5s`(RMQS&e(oYONx8)Z#}x^ zc7H^=v)l~xW2gk$wse}I-+1_N;3q(XsFa}kI=E{;MhXhlaKbGp2`^A&u&*x&Acb`s z@(Z8?_%~Amb4_Ewh?^<<$f8x+w#||Ex3Nc>B~QWLN^}g6Ti`LTy}0Lbb9Y_enlsni z1QJ54h{pVz8jkOOa_%99MWW{^=gyl>h%F0*LZ+Gre-wz%frQz(!8>J z+bbk<{A+H9Wwy^l#?kQY$; zp#Rq+LINmhG&O5=v7k4m*g`)2h!!0?16!Lcj~BqcW}a_4{=D3^?40;aJ3 zr(9+j3d7EYbJka%N1rPM5b^LQ)$i^RA|G#FkXb>HqzE4yxZCMiU=YfE(^L?rwcOdAgxG3R9jYA*t 
z?WaerA*7A>Aoa9wiyl_1BbI(?To3Ld_0xvkKR(gZL*Kh2vIqNvuXT9yO zNmpRsBF2@2nDZ#e0^$e;8k^2l-=J79mlf?E;r8Xei5}Y$J314|^$TdfI01dtOjO2t zFPBTmXR{jd1d?xjj>nj+e?4@={!i+oq|`5y>j-cpBRHe%D9OUb&y(v`Rk+^VJSNKR z<|a=up3kNX?|HYWWaxZwX1bu3yAt>1RPrXmLwXK}`e#iRpKhN~VaI{gG-9Po?I~da zA%P|*wv;^rB~(sbpzv_H1KNT#gsX^wjsbu&qoMD!el@4~OUMKrY#LZV<7%>vn|Fzd zP2YHZtjl;=ygn)(eLwEywyzBHfA8q?hh@p)t^R^y9J(hiq2|r)!wb}->GV$x82GEg zAap;NS{9hZM;Y-~!w%bUXm@$8Z)}KjUltYqPcf*?9CP2Ra`2k@sRTrg+}Fa}S8Rj( z^;d2^iAoIhHa5Jo>8c4rs2DoLMd9P&uC5;PDaRCJT9vDXe~Zh*`dMLz;isA`ujh## z^b@f9tXaLLD!;oYvHVqg5`Q;|H7vLTFXt;xG{@Q+4L`#UV|D$i@!@6%bSSC);ZCD2 z^+5);s}as=aV&>q9a4(VwHMdOIhZ7jXt$o-uZ=8lzbOm#OS{|W#yH5ACw@|-&26Zc zTpl{beJ0PFjQDRYBQJ&Z2HHKht+I??Z-x*}Oly}WU(Dczl(>i%!69FgkA25S!i2p3 z`5v;f^XomoPdju(lxM^8_v_;PuuHkLcN`I-xpvdN!B|ss_7330eCfNw?9zMy);6v+ zw^y>P(RXhe^lSx^>qaqbiBJ6@sB{8fNKir(<&^C8&9x-rI||x^@WD!!q z^YFrCZ%TJdxtW`zIE-2!^N_87KHnNAw{GmDu;T*N1GvV6QPD>U5CFv|QL3|#kwXX| zYE-5CGzS+FH~#Q_PmfOU)4?xcqNN?oLoPcEtBYlRuIdk4-_84bb}WQS!8E}VA{YCV z_lup<=iE6-GpwL6XI*;HmT&~?@nfe!)T)Ln!VW{>==^-Y7mb!T4U8eagBh4I!o!&kN3)u*<)dckM^+CnB7eF4**8Tb zx)qIz9PJ;5RIoEgFZ~GJqnA`Jk-`E%GHlOioj-%fwk%183Ui$8Mw{z%xF2zPZBP5u zoBIT_t>)sY1}ehDcM$a`2&l;!XDG}MYDSqXA&TLkGgnH0s3@2{SsG)VWM>CLt__J@ zJLiUDLmUMP-w6jYj@I<*VTR#{h<-i~T@?N92K=PY;-jkA2FVAzQD{I87!#a`Asnr- zH3OY}C4-G!k_!7h(ohfAU>XN0u5H9{x$!I6$=Fp2Bvj01dDGi*Xw+EicqA=7=ZEnR z_buE1CVNmHEJO@ch)Y0jP*M=n62l%?sLW8{pRBz5seq1_20_40_100_500 -AAAGAATTCGAATTCATGTCGAAAGCTACATATAAGAATTCGAATTCGGAACGTGCTGCTACTCATCCTAGTCCTGTTGCTGCCAAGGAATTCGAATTC -CTATTTAATATCATGCACGAAAAGCAAACAAACTTGTGTGCTTCATTGGATGTTCGTACCACCAAGGAATTACTGGAGTTAGTTGAAGCATTAGGTCCC -AGAATTCGAATTCGGGTTTGTTTACTAAAAACACATGTGGATATCTTGACTGATTTTTCCATGGAGGGCACAGTTAAGCCGCTAAAGGCATTATCCGCC -AAGTACAATTTTTTACTCTTCGAAGACAGAAAATTTGCTGACATTGGTAATACAGTCAAATTGCAGTACTCTGCGGGTGTATACAGAATAGCAGAATGG -GCAGACATTACGAATGCACACGGTGTGGTGGGCCCAGGTATTGTTAGCGGTTTGAAGCAGGCGGCGGAAGAAGTAACAAAGGAACCTAGAGGCCTTTTG 
-ATGTTAGCAGAATTGTCATGCAAGGGCTCCCTAGCTACTGGAGAATATACTAAGGGTACTGTTGACATTGCGAAGAGCGACAAAGATTTTGTTATCGGC -TTTATTGCTCAAAGAGACATGGGTGGAAGAGATGAAGGTTACGATTGGTTGATTATGACACCCGGTGTGGGTTTAGATGACAAGGGAGACGCATTGGGT -CAACAGTATAGAACCGTGGAATTCGAATTCGATGATGTGGTCTCTACAGGATCTGACATTATTATTGTTGGAAGAGGACTATTTGCAAAGGGAAGGGAT -GCTAAGGTAGAGGGTGAACGTTACAGAAAAGCAGGCTGGGAAGCATATTTGAGAAGATGCGGCCAGCAAAACTAA diff --git a/srrTomat0/tests/test_motif.py b/srrTomat0/tests/test_motif.py deleted file mode 100644 index 9926101..0000000 --- a/srrTomat0/tests/test_motif.py +++ /dev/null @@ -1,111 +0,0 @@ -import unittest -import os -import io -import pandas as pd -import numpy.testing as npt -import copy - -from srrTomat0.motifs import meme, homer_motif - -artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts") - -MEME_FILE_NAME = "test.meme" -MOTIF_FILE_NAME = "test.motif" -PWM_FILE_NAME = "M00799_2.00.txt" -ECORI_FILE_NAME = "test_ecori.meme" - -TEST_MOTIF_MATRIX = """\ -0.248650039776609 0.26139859992769 0.241301320519092 0.248650039776609 -0.392226269785661 0.219606847798542 0.0702344472606129 0.317932435155184 -0.0257615986027584 0.0113462234969035 0.942829024734492 0.0200631531658465 -0.931183694119653 0.0181098604626899 0.00917476304082202 0.0415316823768348 -0.0209101275685474 0.0204960793014344 0.00500398009361691 0.953589813036401 -0.652610298711976 0.0667517267238459 0.0282467996828702 0.252391174881308 -0.541666052707409 0.110459581183674 0.158324966820951 0.189549399287965 -0.196060485729292 0.286696226860535 0.350171562229744 0.167071725180429 -0.286129811839987 0.225077681884186 0.309960724838339 0.178831781437488 -""" - -PWM = pd.read_csv(io.StringIO(TEST_MOTIF_MATRIX), sep="\t", index_col=None, header=None).values - - -class TestMotifParsers(unittest.TestCase): - - def test_meme_loader(self): - - meme_file_name = os.path.join(artifact_path, MEME_FILE_NAME) - motifs = meme.read(meme_file_name) - - with open(meme_file_name) as meme_fh: - 
motifs2 = meme.read(meme_fh) - - self.assertEqual(len(motifs), 1) - npt.assert_array_almost_equal(motifs[0].probability_matrix, PWM, 4) - npt.assert_array_almost_equal(motifs2[0].probability_matrix, PWM, 4) - - self.assertListEqual(motifs[0].alphabet, list("ACGT")) - self.assertEqual(motifs[0].alphabet_len, 4) - self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) - - def test_homer_motif_loader(self): - - motif_file_name = os.path.join(artifact_path, MOTIF_FILE_NAME) - motifs = homer_motif.read(motif_file_name) - - with open(motif_file_name) as meme_fh: - motifs2 = homer_motif.read(meme_fh) - - self.assertEqual(len(motifs), 1) - npt.assert_array_almost_equal(motifs[0].probability_matrix, PWM, 4) - npt.assert_array_almost_equal(motifs2[0].probability_matrix, PWM, 4) - - self.assertListEqual(motifs[0].alphabet, list("ACGT")) - self.assertEqual(motifs[0].alphabet_len, 4) - self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) - - def test_pwm_loader(self): - - motif_file_name = os.path.join(artifact_path, MOTIF_FILE_NAME) - motifs = homer_motif.read(motif_file_name) - - with open(motif_file_name) as meme_fh: - motifs2 = homer_motif.read(meme_fh) - - self.assertEqual(len(motifs), 1) - npt.assert_array_almost_equal(motifs[0].probability_matrix, PWM, 4) - npt.assert_array_almost_equal(motifs2[0].probability_matrix, PWM, 4) - - self.assertListEqual(motifs[0].alphabet, list("ACGT")) - self.assertEqual(motifs[0].alphabet_len, 4) - self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) - - -class TestMotifProps(unittest.TestCase): - - @classmethod - def setUpClass(cls): - - meme_file_name = os.path.join(artifact_path, ECORI_FILE_NAME) - cls.master_motif = meme.read(meme_file_name)[0] - - def setUp(self): - - self.motif = copy.deepcopy(self.master_motif) - - def test_information_content(self): - - self.assertEqual(self.motif.information_content, 24.0) - - def test_consensus(self): - - self.assertEqual(self.motif.consensus, "GAATTCGAATTC") - - 
def test_length(self): - - self.assertEqual(len(self.motif), 12) - - def test_score(self): - - self.assertEqual(self.motif.score_match("GAATTCGAATTC"), 24.0) - self.assertEqual(self.motif.score_match("GAATTCCTTAAG"), 12.0) - self.assertEqual(self.motif.score_match("CTTAAGCTTAAG"), 0.0) diff --git a/srrTomat0/tests/test_motif_scan.py b/srrTomat0/tests/test_motif_scan.py deleted file mode 100644 index 51e9d4e..0000000 --- a/srrTomat0/tests/test_motif_scan.py +++ /dev/null @@ -1,88 +0,0 @@ -import unittest -import os -import io -import pandas as pd -import numpy as np -import numpy.testing as npt - -from srrTomat0.motifs._motif import __MotifScanner as MotifScanner -from srrTomat0.motifs import Motif, fimo, homer, SCAN_SCORE_COL - -artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts") - -FASTA_FILE_NAME = os.path.join(artifact_path, "test_motif_search.fasta") -BED_FILE_NAME = os.path.join(artifact_path, "test_motif_search.bed") - - -TEST_MOTIF_MATRIX = """\ -0.0\t0.0\t1.0\t0.0 -1.0\t0.0\t0.0\t0.0 -1.0\t0.0\t0.0\t0.0 -0.0\t0.0\t0.0\t1.0 -0.0\t0.0\t0.0\t1.0 -0.0\t1.0\t0.0\t0.0 -0.0\t0.0\t1.0\t0.0 -1.0\t0.0\t0.0\t0.0 -1.0\t0.0\t0.0\t0.0 -0.0\t0.0\t0.0\t1.0 -0.0\t0.0\t0.0\t1.0 -0.0\t1.0\t0.0\t0.0 -""" - -MOTIF_OBJ = Motif("EcoRI", "EcoRI", list("ACGT")) -MOTIF_OBJ.probability_matrix = pd.read_csv(io.StringIO(TEST_MOTIF_MATRIX), sep="\t", header=None, index_col=None)\ - .astype(float)\ - .values - -MOTIF_STARTS = [3, 35, 87, 199, 711] - - -class TestScan(unittest.TestCase): - - def test_base(self): - - scanner = MotifScanner(motifs=[MOTIF_OBJ], num_workers=1) - - with self.assertRaises(NotImplementedError): - scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) - - with self.assertRaises(NotImplementedError): - scanner._preprocess(8) - - with self.assertRaises(NotImplementedError): - scanner._parse_output(None) - - with self.assertRaises(NotImplementedError): - scanner._get_motifs(None, None) - - def test_fimo(self): - 
scanner = fimo.FIMOScanner(motifs=[MOTIF_OBJ], num_workers=1) - motif_locs = scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) - - self.assertEqual(motif_locs.shape[0], 10) - self.assertEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "+", :].shape[0], 5) - self.assertEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "-", :].shape[0], 5) - - self.assertListEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "+", fimo.FIMO_START].tolist(), - MOTIF_STARTS) - self.assertListEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "-", fimo.FIMO_START].tolist(), - MOTIF_STARTS) - - npt.assert_array_almost_equal(np.array([24.0] * 10), motif_locs[SCAN_SCORE_COL].values) - - def test_homer(self): - scanner = homer.HOMERScanner(motifs=[MOTIF_OBJ], num_workers=1) - motif_locs = scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) - - self.assertEqual(motif_locs.shape[0], 10) - self.assertEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "+", :].shape[0], 5) - self.assertEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "-", :].shape[0], 5) - - self.assertListEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "+", homer.HOMER_START] - .tolist(), - MOTIF_STARTS) - self.assertListEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "-", homer.HOMER_START].sort_values() - .tolist(), - MOTIF_STARTS) - - npt.assert_array_almost_equal(np.array([24.0] * 10), motif_locs[SCAN_SCORE_COL].values) diff --git a/srrTomat0/tests/test_srr.py b/srrTomat0/tests/test_srr.py deleted file mode 100644 index afad3e1..0000000 --- a/srrTomat0/tests/test_srr.py +++ /dev/null @@ -1,50 +0,0 @@ -import tempfile -import shutil -import os -import unittest - -from srrTomat0.processor.srr import get_srr_files, unpack_srr_files - -TEST_SRR_IDS = ["SRR053325"] - - -class TestSRR(unittest.TestCase): - - srr_ids = TEST_SRR_IDS - temp_path = None - - @classmethod - def setUpClass(cls): - cls.temp_path = tempfile.mkdtemp() - - @classmethod - def tearDownClass(cls): - 
shutil.rmtree(cls.temp_path, ignore_errors=True) - - def test_srr_get_success(self): - srr_files = get_srr_files(self.srr_ids, self.temp_path, prefetch_options=["--transport", "http"]) - self.assertTrue(os.path.exists(srr_files[0])) - self.assertEqual(os.path.getsize(srr_files[0]), 31838) - - srr_files_2 = get_srr_files(self.srr_ids, self.temp_path) - self.assertEqual(srr_files[0], srr_files_2[0]) - - def test_srr_get_fail(self): - self.assertIsNone(get_srr_files([""], os.path.join(self.temp_path, "blah", "blah"), - prefetch_options=["--transport", "http"])[0]) - - def test_srr_unpack_success(self): - srr_files = get_srr_files(self.srr_ids, self.temp_path, prefetch_options=["--transport", "http"]) - fastq_files = unpack_srr_files(self.srr_ids, srr_files, self.temp_path) - self.assertTrue(all(map(lambda x: os.path.exists(x), fastq_files[0]))) - self.assertEqual(len(fastq_files[0]), 3) - - fastq_files2 = unpack_srr_files(self.srr_ids, srr_files, self.temp_path) - self.assertListEqual(fastq_files[0], fastq_files2[0]) - - def test_srr_unpack_fail(self): - self.assertListEqual(unpack_srr_files(self.srr_ids, [""], self.temp_path)[0], [None]) - - def test_srr_unpack_skip(self): - self.assertListEqual(unpack_srr_files(self.srr_ids, [None], self.temp_path)[0], [None]) - diff --git a/srrTomat0/tests/test_star.py b/srrTomat0/tests/test_star.py deleted file mode 100644 index fbe70e7..0000000 --- a/srrTomat0/tests/test_star.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import tempfile -import unittest -import shutil - -import pysam - -from srrTomat0.processor.star import star_mkref, star_align_fastqs - -GENOME_FILE_NAME = "sc64_mito.fasta" -ANNOTATION_FILE_NAME = "sc64_mito.gtf" -TEST_FASTQ = "test_alignment.fastq" - -STAR_REF_FILES = ["chrLength.txt", "chrName.txt", "exonGeTrInfo.tab", "geneInfo.tab", "genomeParameters.txt", "SA", - "sjdbInfo.txt", "sjdbList.out.tab", "chrNameLength.txt", "chrStart.txt", "exonInfo.tab", "Genome", - "SAindex", "sjdbList.fromGTF.out.tab", 
"transcriptInfo.tab"] - -artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts") - - -class TestSTAR(unittest.TestCase): - genome_file = os.path.join(artifact_path, GENOME_FILE_NAME) - annotation_file = os.path.join(artifact_path, ANNOTATION_FILE_NAME) - fastq_file = os.path.join(artifact_path, TEST_FASTQ) - - temp_path = None - star_ref_path = None - sam_out_path = None - - @classmethod - def setUpClass(cls): - cls.temp_path = tempfile.mkdtemp() - cls.star_ref_path = os.path.join(cls.temp_path, "star") - cls.sam_out_path = os.path.join(cls.temp_path, "sam") - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.temp_path, ignore_errors=True) - - def test_star(self): - self._step_mkref() - sam_files = self._step_align() - self._step_test_alignment(sam_files) - - def _step_mkref(self): - out_path = star_mkref(self.star_ref_path, genome_file=[self.genome_file], annotation_file=self.annotation_file, - move_files=False) - - self.assertFalse(out_path is None) - self.assertTrue(out_path == self.star_ref_path) - - for file_name in STAR_REF_FILES: - self.assertTrue(os.path.exists(os.path.join(out_path, file_name))) - - def _step_align(self): - sam_files = star_align_fastqs(["TEST"], [[self.fastq_file]], self.star_ref_path, self.sam_out_path) - - self.assertFalse(sam_files[0] is None) - - return sam_files - - def _step_test_alignment(self, sam_files): - samfile = pysam.AlignmentFile(sam_files[0], "r") - reads = [aln for aln in samfile.fetch()] - - self.assertEqual(len(reads), 9) - self.assertEqual(sum(map(lambda x: x.is_reverse, reads)), 3) - - self.assertListEqual(list(map(lambda x: x.reference_start, reads)), - [28620, 30480, 3078, 4893, 5465, 5545, 37969, 37969, 38033]) - self.assertListEqual(list(map(lambda x: x.reference_end, reads)), - [28719, 30579, 3177, 4992, 5564, 5644, 38068, 38068, 38132]) diff --git a/srrTomat0/tests/test_utils.py b/srrTomat0/tests/test_utils.py deleted file mode 100644 index 
71d6aad..0000000 --- a/srrTomat0/tests/test_utils.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import shutil -import tempfile -import unittest -from urllib.error import URLError - -import srrTomat0.processor.utils as utils - - -class TestUtils(unittest.TestCase): - temp_path = None - - @classmethod - def setUpClass(cls): - cls.temp_path = tempfile.mkdtemp() - cls.star_ref_path = os.path.join(cls.temp_path, "star") - cls.sam_out_path = os.path.join(cls.temp_path, "sam") - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.temp_path, ignore_errors=True) - - def test_ncbi_genomes(self): - fasta, gtf = utils.get_genome_file_locs("sc64") - self.assertEqual(gtf[1], "sc64.gtf.gz") - - with self.assertRaises(ValueError): - utils.get_genome_file_locs("not_a_real_thing") - - def test_get_file_from_url(self): - fasta, gtf = utils.get_genome_file_locs("sc64") - target_path = os.path.join(self.temp_path, "test.gtf.gz") - file_path = utils.get_file_from_url(gtf[0], target_path) - self.assertEqual(file_path, target_path) - self.assertTrue(os.path.exists(file_path)) - - with self.assertRaises(URLError): - file_path = utils.get_file_from_url(gtf[0] + "does_not_exist.file", target_path) - - def test_requirements(self): - self.assertTrue(utils.test_requirements_exist(test_targets=["python"], - test_package={"python": ("python", ["python", "--version"])}, - test_htseq=False, test_chroma=False)) - - self.assertTrue(utils.test_requirements_exist(test_targets=["python"], - test_package={"python": ("python", ["python", "--version"])}, - test_htseq=True, test_chroma=True)) - - with self.assertRaises(FileNotFoundError): - utils.test_requirements_exist(test_targets=["not_a-.thing"], - test_package={"not_a-.thing": ("fake", ["not_a-.thing", "--version"])}, - test_htseq=False, test_chroma=False) - - def test_file_path_abs(self): - self.assertEqual(os.path.abspath(os.path.expanduser("~")), utils.file_path_abs("~")) From 3315ea904b382883c9889bb5e1afc1bea573f51c Mon Sep 17 00:00:00 
2001 From: asistradition Date: Sun, 4 Oct 2020 12:59:11 -0400 Subject: [PATCH 15/20] Include strand in dupe check --- inferelator_prior/ATACTomat0.py | 87 ----------- inferelator_prior/ChIPBedTomat0.py | 145 ------------------ ...qTomat0.py => expression_data_from_srr.py} | 0 inferelator_prior/motifs/fimo.py | 3 +- inferelator_prior/motifs/homer.py | 3 +- .../tests/artifacts/test.transfac.dat | 0 .../artifacts/test_motif_search.fasta.fai | 1 - .../tests/artifacts/test_motif_search.gtf | 0 inferelator_prior/tests/test_prior.py | 0 9 files changed, 4 insertions(+), 235 deletions(-) delete mode 100644 inferelator_prior/ATACTomat0.py delete mode 100755 inferelator_prior/ChIPBedTomat0.py rename inferelator_prior/{RNASeqTomat0.py => expression_data_from_srr.py} (100%) create mode 100644 inferelator_prior/tests/artifacts/test.transfac.dat delete mode 100644 inferelator_prior/tests/artifacts/test_motif_search.fasta.fai create mode 100644 inferelator_prior/tests/artifacts/test_motif_search.gtf create mode 100644 inferelator_prior/tests/test_prior.py diff --git a/inferelator_prior/ATACTomat0.py b/inferelator_prior/ATACTomat0.py deleted file mode 100644 index 8ec53d5..0000000 --- a/inferelator_prior/ATACTomat0.py +++ /dev/null @@ -1,87 +0,0 @@ -import argparse -import os - -import pandas as pd - -from inferelator_prior.processor.utils import file_path_abs -from inferelator_prior.processor.srr import get_srr_files, unpack_srr_files -from inferelator_prior.processor.star import star_align_fastqs -from inferelator_prior.processor.samtools import sam_sort - -from inferelator_prior import SRR_SUBPATH, FASTQ_SUBPATH, STAR_ALIGNMENT_SUBPATH, BAM_SUBPATH - - -OUTPUT_MATRIX_FILE_NAME = "atac_matrix.tsv" - -COUNT_FILE_METAINDEXES = ["N_unmapped", "N_multimapping", "N_noFeature", "N_ambiguous"] -COUNT_FILE_HEADER = ["Total", "MinusStrand", "PlusStrand"] -COUNT_FILE_HEADER_FOR_OUTPUT = "Total" - - -def main(): - ap = argparse.ArgumentParser(description="Turn ATAC-seq expression SRRs from 
NCBI GEO into a prior matrix") - ap.add_argument("-s", "--srr", dest="srr", help="SRR record IDs", nargs="+", metavar="SRRID", default=None) - ap.add_argument("-f", "--file", dest="file", help="List of SRR records in a TXT file", metavar="FILE", default=None) - ap.add_argument("-g", "--genome", dest="genome", help="STAR reference genome", metavar="PATH", required=True) - ap.add_argument("-o", "--out", dest="out", help="Output PATH", metavar="PATH", required=True) - ap.add_argument("--gzip", dest="gzip", help="GZIP output file", action='store_const', const=True, default=False) - - args = ap.parse_args() - srr_ids = list() - - if args.srr is None and args.file is None: - print("One of --srr or --file must be set") - exit(1) - elif args.srr is not None and args.file is not None: - print("Only one of --srr or --file may be set (not both)") - exit(1) - elif args.srr is not None: - # SRR IDs are provided at command line - srr_ids = args.srr - elif args.file is not None: - # SRR IDs are in a .txt file; read them into a list - srr_ids = pd.read_csv(args.file, sep="\t", index_col=None, header=None).iloc[:, 0].tolist() - else: - raise ValueError("There is something wrong with this switch") - - atac_tomat0(srr_ids, args.out, args.genome, gzip_output=args.gzip) - - -def atac_tomat0(srr_ids, output_path, star_reference_genome, gzip_output=False, cores=4, star_jobs=2, star_args=None, - min_quality=None): - - star_args = [] if star_args is None else star_args - - output_path = file_path_abs(output_path) - os.makedirs(output_path, exist_ok=True) - - # Download all the SRR files - print("Downloading SRR files") - os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True) - srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores) - - # Unpack all the SRR files into FASTQ files - print("Unpacking SRR files") - os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True) - fastq_file_names = unpack_srr_files(srr_ids, 
srr_file_names, os.path.join(output_path, FASTQ_SUBPATH), - num_workers=cores) - - # Run all the FASTQ files through STAR to align - print("Aligning FASTQ files") - os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True) - thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs)) - sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome, - os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), - num_workers=star_jobs, threads_per_worker=thread_count, star_options=star_args) - - # Sort all the SAM files into BAM files - print("Sorting SAM files into BAM files") - os.makedirs(os.path.join(output_path, BAM_SUBPATH), exist_ok=True) - bam_file_names = sam_sort(srr_ids, sam_file_names, os.path.join(output_path, BAM_SUBPATH), min_quality=min_quality, - num_workers=cores) - - -if __name__ == '__main__': - main() - - diff --git a/inferelator_prior/ChIPBedTomat0.py b/inferelator_prior/ChIPBedTomat0.py deleted file mode 100755 index 0e900d3..0000000 --- a/inferelator_prior/ChIPBedTomat0.py +++ /dev/null @@ -1,145 +0,0 @@ -import argparse -import pandas as pd - -import pybedtools - -from inferelator_prior.processor.gtf import load_gtf_to_dataframe, SEQ_START, SEQ_STOP, GTF_GENENAME, SEQ_TSS -from inferelator_prior.processor.utils import file_path_abs -from inferelator_prior.processor.bedtools import get_peaks_in_features - -# Column names -BED_CHROMOSOME = 'chrom' -SEQ_COUNTS = 'count' -SEQ_BIN = 'bin' - -# Quantiles for bin -PEAK_QUANTILES = [0.25, 0.5, 0.75, 1] - - -def main(): - ap = argparse.ArgumentParser(description="Load peaks and genes.") - ap.add_argument("-f", "--file", dest="file", help="TSV file with ID|BED PATH pairs", metavar="FILE", default=None) - ap.add_argument("-b", "--bed", dest="bed", help="BED file containing ChIP peaks", nargs="+", metavar="FILE", - default=None) - ap.add_argument("-a", "--annotation", dest="anno", help="GTF/GFF Annotation File", metavar="FILE", required=True) - 
ap.add_argument("-o", "--out", dest="out", help="Output TSV PATH", metavar="PATH", required=True) - ap.add_argument("-w", "--window", dest="window_size", help="Window size", type=int, default=0) - ap.add_argument("-g", "--genebody", dest="gene", help="Gene body", action="store_const", const=True, default=False) - ap.add_argument("-t", "--tss", dest="tss", help="Transcription start site", action="store_const", const=True, - default=False) - args = ap.parse_args() - - if args.bed is None and args.file is None: - print("One of --bed or --file must be set") - exit(1) - elif args.bed is not None and args.file is not None: - print("Only one of --bed or --file may be set (not both)") - exit(1) - elif args.bed is not None: - # SRR IDs are provided at command line - id_names = args.bed - chip_bed_files = args.bed - elif args.file is not None: - # SRR IDs are in a .txt file; read them into a list - chip_samples = pd.read_csv(args.file, sep="\t", index_col=None, header=None) - if chip_samples.shape[1] != 2: - print("The TSV file must have two columns: ID and File_Path") - id_names = chip_samples.iloc[:, 0].tolist() - chip_bed_files = chip_samples.iloc[:, 1].tolist() - else: - raise ValueError("There is something wrong with this switch") - - if args.gene is False and args.tss is False: - print("One of --genebody or --tss must be set") - exit(1) - elif args.gene is not False and args.tss is not False: - print("Only one of --genebody or --tss may be set (not both)") - exit(1) - - chip_bed_tomat0(id_names, chip_bed_files, args.anno, output_path=args.out, window_size=args.window_size, - gene_body_flag=args.gene, tss_flag=args.tss) - - -def chip_bed_tomat0(id_names, chip_peaks_file, annotation_file, output_path=None, window_size=0, gene_body_flag = False, - tss_flag = False): - """ - Process a BED file of peaks into a integer peak-count matrix - :param chip_peaks_file: list(str) - List of paths to a BED file - :param output_path: str - Path to the output TSV file - :param 
annotation_file: str - Path to the GTF annotation file - :param window_size: int - Window on each side of a gene to include a peak in the count - 100 means 100bp up from start and 100bp down from end - :return gene_counts: pd.DataFrame - Integer count matrix of peaks per gene - """ - - # Convert paths to absolutes - output_path = file_path_abs(output_path) - annotation_file = file_path_abs(annotation_file) - - # Load annotations into a dataframe with pybedtools - # Adjust the start and stop positions to account for a flanking window - genes = load_gtf_to_dataframe(annotation_file) - - if gene_body_flag: - genes = open_window(genes, window_size) - if tss_flag: - genes = open_tss(genes, window_size) - - - prior_data = pd.DataFrame(index=genes[GTF_GENENAME]) - for id_name, peak_file in zip(id_names, chip_peaks_file): - # Load BED file into a dataframe with pybedtools - peak_file = file_path_abs(peak_file) - chip_peaks = pybedtools.BedTool(peak_file).to_dataframe() - gene_counts = get_peaks_in_features(genes, chip_peaks) - - # Get non-zero quantiles and use them to bin peak overlap by length - quantiles = gene_counts.loc[gene_counts[SEQ_COUNTS] != 0, SEQ_COUNTS].quantile(PEAK_QUANTILES) - gene_counts[SEQ_BIN] = 0 - - for i, qval in enumerate(quantiles.sort_values(ascending=True)): - gene_counts.loc[gene_counts[SEQ_COUNTS] >= qval, SEQ_BIN] = i + 1 - - # Rename the column with ID and reindex for join - gene_counts = gene_counts.rename({SEQ_BIN: id_name}).set_index(GTF_GENENAME).drop([SEQ_COUNTS], axis=1) - prior_data = prior_data.join(gene_counts, on=[GTF_GENENAME]) - - if output_path is not None: - prior_data.to_csv(output_path, sep="\t") - - return prior_data - - -def open_window(annotation_dataframe, window_size): - """ - This needs to adjust the start and stop in the annotation dataframe with window sizes - :param annotation_dataframe: pd.DataFrame - :param window_size: int - :return windowed_dataframe: pd.DataFrame - """ - windowed_dataframe = 
annotation_dataframe.copy() - windowed_dataframe[SEQ_START] = windowed_dataframe[SEQ_START] - window_size - windowed_dataframe[SEQ_STOP] = windowed_dataframe[SEQ_STOP] + window_size - windowed_dataframe.loc[windowed_dataframe[SEQ_START] < 0, SEQ_START] = 0 - return windowed_dataframe - -def open_tss(annotation_file, window_size): - """ - This needs to adjust the start and stop in the annotation dataframe with window sizes - :param annotation_dataframe: pd.DataFrame - :param window_size: int - :return windowed_dataframe: pd.DataFrame - """ - tss_dataframe = annotation_file.copy() - tss_dataframe[SEQ_START] = tss_dataframe[SEQ_TSS] - window_size - tss_dataframe[SEQ_STOP] = tss_dataframe[SEQ_TSS] + window_size - tss_dataframe.loc[tss_dataframe[SEQ_START] < 0, SEQ_START] = 0 - return tss_dataframe - -if __name__ == '__main__': - main() diff --git a/inferelator_prior/RNASeqTomat0.py b/inferelator_prior/expression_data_from_srr.py similarity index 100% rename from inferelator_prior/RNASeqTomat0.py rename to inferelator_prior/expression_data_from_srr.py diff --git a/inferelator_prior/motifs/fimo.py b/inferelator_prior/motifs/fimo.py index 30344b8..2af820b 100644 --- a/inferelator_prior/motifs/fimo.py +++ b/inferelator_prior/motifs/fimo.py @@ -31,7 +31,8 @@ def _preprocess(self, min_ic=None): return chunk_motifs(meme, self.motifs, num_workers=self.num_workers, min_ic=min_ic) def _postprocess(self, motif_peaks): - motif_peaks = motif_peaks.drop_duplicates(subset=[FIMO_MOTIF, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME]) + motif_peaks = motif_peaks.drop_duplicates(subset=[FIMO_MOTIF, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME, + FIMO_STRAND]) return motif_peaks def _get_motifs(self, fasta_file, motif_file, threshold=None): diff --git a/inferelator_prior/motifs/homer.py b/inferelator_prior/motifs/homer.py index 04a23a7..4f905bb 100644 --- a/inferelator_prior/motifs/homer.py +++ b/inferelator_prior/motifs/homer.py @@ -32,7 +32,8 @@ def _preprocess(self, min_ic=None): return 
chunk_motifs(homer_motif, self.motifs, num_workers=self.num_workers, min_ic=min_ic) def _postprocess(self, motif_peaks): - motif_peaks = motif_peaks.drop_duplicates(subset=[HOMER_MOTIF, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME]) + motif_peaks = motif_peaks.drop_duplicates(subset=[HOMER_MOTIF, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME, + HOMER_STRAND]) return motif_peaks def _get_motifs(self, fasta_file, motif_file, threshold=None): diff --git a/inferelator_prior/tests/artifacts/test.transfac.dat b/inferelator_prior/tests/artifacts/test.transfac.dat new file mode 100644 index 0000000..e69de29 diff --git a/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai b/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai deleted file mode 100644 index bc6177b..0000000 --- a/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai +++ /dev/null @@ -1 +0,0 @@ -seq1_20_40_100_500 867 20 99 100 diff --git a/inferelator_prior/tests/artifacts/test_motif_search.gtf b/inferelator_prior/tests/artifacts/test_motif_search.gtf new file mode 100644 index 0000000..e69de29 diff --git a/inferelator_prior/tests/test_prior.py b/inferelator_prior/tests/test_prior.py new file mode 100644 index 0000000..e69de29 From 16e0862989dba0b4f04d8d67c8b06ea18b06d936 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 4 Oct 2020 12:59:31 -0400 Subject: [PATCH 16/20] Rename --- inferelator_prior/network_from_motifs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inferelator_prior/network_from_motifs.py b/inferelator_prior/network_from_motifs.py index a5d4b6d..347972b 100644 --- a/inferelator_prior/network_from_motifs.py +++ b/inferelator_prior/network_from_motifs.py @@ -1,6 +1,6 @@ from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND) -from inferelator_prior.processor.prior import build_prior_from_atac_motifs, MotifScorer +from inferelator_prior.processor.prior import 
build_prior_from_motifs, MotifScorer from inferelator_prior.motifs.motif_scan import MotifScan from inferelator_prior.motifs import motifs_to_dataframe, INFO_COL, MOTIF_NAME_COL from inferelator_prior.processor._species_constants import SPECIES_MAP @@ -148,8 +148,8 @@ def build_atac_motif_prior(motif_file, atac_bed_file, annotation_file, genomic_f # Processing into prior print("Processing TF binding sites into prior") MotifScorer.set_information_criteria(min_binding_ic=motif_ic, max_dist=tandem) - prior_edges, prior_matrix, raw_matrix = build_prior_from_atac_motifs(genes, motif_peaks, motif_information, - num_workers=num_cores) + prior_edges, prior_matrix, raw_matrix = build_prior_from_motifs(genes, motif_peaks, motif_information, + num_workers=num_cores) print("Prior matrix with {n} edges constructed".format(n=prior_edges.shape[0])) return prior_edges, prior_matrix, raw_matrix From 4c8745089408bd535244d48b6ca8e7883287ad62 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 4 Oct 2020 12:59:47 -0400 Subject: [PATCH 17/20] New tests --- .../tests/artifacts/test.transfac.dat | 47 ++++++++ .../tests/artifacts/test_motif_search.bed | 2 +- .../tests/artifacts/test_motif_search.fasta | 2 +- .../artifacts/test_motif_search.fasta.fai | 1 + .../tests/artifacts/test_motif_search.gtf | 10 ++ inferelator_prior/tests/test_motif.py | 25 +++- inferelator_prior/tests/test_motif_scan.py | 4 +- inferelator_prior/tests/test_prior.py | 110 ++++++++++++++++++ 8 files changed, 193 insertions(+), 8 deletions(-) create mode 100644 inferelator_prior/tests/artifacts/test_motif_search.fasta.fai diff --git a/inferelator_prior/tests/artifacts/test.transfac.dat b/inferelator_prior/tests/artifacts/test.transfac.dat index e69de29..2da9353 100644 --- a/inferelator_prior/tests/artifacts/test.transfac.dat +++ b/inferelator_prior/tests/artifacts/test.transfac.dat @@ -0,0 +1,47 @@ +VV TRANSFAC MATRIX TABLE, TEST MATRIX +XX +// +AC M00001 +XX +ID I$GATA4 +XX +DT 10.10.2020 (created); ewi. 
+DT 10.10.2020 (updated); sla. +CO CC-BY +XX +NA GATA4 +XX +DE GATA binding protein 4 +XX +TY specific +XX +OS Vertebrata +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata +XX +CL zinc-finger; zinc-finger-CAGATAAGG +XX +HP M00804; V$GATA4; (family). +XX +BF P43694; GATA4; Species: human, Homo sapiens; site(s) included: no +BF Q08369; GATA4; Species: mouse, Mus musculus; site(s) included: yes. +XX +P0 A C G T +01 25 26 24 25 N +02 39 22 7 32 W +03 3 1 94 2 G +04 93 2 1 4 A +05 2 2 1 95 T +06 65 7 3 25 A +07 54 11 16 19 A +08 20 29 35 16 N +09 29 22 31 18 N +XX +BA 100 sequences (matrix compiled from individual genomic sites) +XX +CC generated by TEST; binding sites in 100 genes +XX +SR M00184; V$MYOD_Q6; specific. +XX +PR embryo profile +XX +// diff --git a/inferelator_prior/tests/artifacts/test_motif_search.bed b/inferelator_prior/tests/artifacts/test_motif_search.bed index adb63ad..d8f67f0 100644 --- a/inferelator_prior/tests/artifacts/test_motif_search.bed +++ b/inferelator_prior/tests/artifacts/test_motif_search.bed @@ -1,2 +1,2 @@ -seq1_20_40_100_500 1 740 +seq1 1 740 diff --git a/inferelator_prior/tests/artifacts/test_motif_search.fasta b/inferelator_prior/tests/artifacts/test_motif_search.fasta index d51f61e..fa8e586 100644 --- a/inferelator_prior/tests/artifacts/test_motif_search.fasta +++ b/inferelator_prior/tests/artifacts/test_motif_search.fasta @@ -1,4 +1,4 @@ ->seq1_20_40_100_500 +>seq1 AAAGAATTCGAATTCATGTCGAAAGCTACATATAAGAATTCGAATTCGGAACGTGCTGCTACTCATCCTAGTCCTGTTGCTGCCAAGGAATTCGAATTC CTATTTAATATCATGCACGAAAAGCAAACAAACTTGTGTGCTTCATTGGATGTTCGTACCACCAAGGAATTACTGGAGTTAGTTGAAGCATTAGGTCCC AGAATTCGAATTCGGGTTTGTTTACTAAAAACACATGTGGATATCTTGACTGATTTTTCCATGGAGGGCACAGTTAAGCCGCTAAAGGCATTATCCGCC diff --git a/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai b/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai new file mode 100644 index 0000000..2f86454 --- /dev/null +++ b/inferelator_prior/tests/artifacts/test_motif_search.fasta.fai @@ -0,0 
+1 @@ +seq1 867 6 99 100 diff --git a/inferelator_prior/tests/artifacts/test_motif_search.gtf b/inferelator_prior/tests/artifacts/test_motif_search.gtf index e69de29..599c8ee 100644 --- a/inferelator_prior/tests/artifacts/test_motif_search.gtf +++ b/inferelator_prior/tests/artifacts/test_motif_search.gtf @@ -0,0 +1,10 @@ +#!genome-build R64-1-1 +#!genome-version R64-1-1 +#!genome-date 2011-09 +#!genome-build-accession GCA_000146045.2 +#!genebuild-last-updated 2018-10 +seq1 sgd gene 100 500 . + . gene_id "T3ST"; gene_name "T3ST"; gene_source "sgd"; gene_biotype "protein_coding"; +seq1 sgd transcript 100 500 . + . gene_id "T3ST"; transcript_id "T3ST_mRNA"; gene_name "T3ST"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; +seq1 sgd exon 100 500 . + . gene_id "T3ST"; transcript_id "T3ST_mRNA"; exon_number "1"; gene_name "T3ST"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; exon_id "T3ST_mRNA-E1"; +seq1 sgd CDS 100 500 . 
+ 0 gene_id "T3ST"; transcript_id "T3ST_mRNA"; exon_number "1"; gene_name "T3ST"; gene_source "sgd"; gene_biotype "protein_coding"; transcript_source "sgd"; transcript_biotype "protein_coding"; protein_id "T3ST"; + diff --git a/inferelator_prior/tests/test_motif.py b/inferelator_prior/tests/test_motif.py index bad2920..0e41436 100644 --- a/inferelator_prior/tests/test_motif.py +++ b/inferelator_prior/tests/test_motif.py @@ -5,11 +5,12 @@ import numpy.testing as npt import copy -from inferelator_prior.motifs import meme, homer_motif +from inferelator_prior.motifs import meme, homer_motif, transfac artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts") MEME_FILE_NAME = "test.meme" +TRANSFAC_FILE_NAME = "test.transfac.dat" MOTIF_FILE_NAME = "test.motif" PWM_FILE_NAME = "M00799_2.00.txt" ECORI_FILE_NAME = "test_ecori.meme" @@ -45,7 +46,7 @@ def test_meme_loader(self): self.assertListEqual(motifs[0].alphabet, list("ACGT")) self.assertEqual(motifs[0].alphabet_len, 4) - self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) + self.assertAlmostEqual(motifs[0].information_content, 7.6857, 3) def test_homer_motif_loader(self): @@ -61,7 +62,7 @@ def test_homer_motif_loader(self): self.assertListEqual(motifs[0].alphabet, list("ACGT")) self.assertEqual(motifs[0].alphabet_len, 4) - self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) + self.assertAlmostEqual(motifs[0].information_content, 7.6857, 3) def test_pwm_loader(self): @@ -77,7 +78,23 @@ def test_pwm_loader(self): self.assertListEqual(motifs[0].alphabet, list("ACGT")) self.assertEqual(motifs[0].alphabet_len, 4) - self.assertAlmostEqual(motifs[0].information_content, 6.082, 3) + self.assertAlmostEqual(motifs[0].information_content, 7.6857, 3) + + def test_transfac_loader(self): + + motif_file_name = os.path.join(artifact_path, TRANSFAC_FILE_NAME) + motifs = transfac.read(motif_file_name) + + with open(motif_file_name) as fh: + motifs2 = 
transfac.read(fh) + + self.assertEqual(len(motifs), 1) + npt.assert_array_almost_equal(motifs[0].probability_matrix, PWM, 2) + npt.assert_array_almost_equal(motifs2[0].probability_matrix, PWM, 2) + + self.assertListEqual(motifs[0].alphabet, list("ACGT")) + self.assertEqual(motifs[0].alphabet_len, 4) + self.assertAlmostEqual(motifs[0].information_content, 7.6535, 3) class TestMotifProps(unittest.TestCase): diff --git a/inferelator_prior/tests/test_motif_scan.py b/inferelator_prior/tests/test_motif_scan.py index ed86977..1f57731 100644 --- a/inferelator_prior/tests/test_motif_scan.py +++ b/inferelator_prior/tests/test_motif_scan.py @@ -57,7 +57,7 @@ def test_base(self): def test_fimo(self): scanner = fimo.FIMOScanner(motifs=[MOTIF_OBJ], num_workers=1) - motif_locs = scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) + motif_locs = scanner.scan(FASTA_FILE_NAME, atac_bed_file=BED_FILE_NAME, min_ic=8) self.assertEqual(motif_locs.shape[0], 10) self.assertEqual(motif_locs.loc[motif_locs[fimo.FIMO_STRAND] == "+", :].shape[0], 5) @@ -72,7 +72,7 @@ def test_fimo(self): def test_homer(self): scanner = homer.HOMERScanner(motifs=[MOTIF_OBJ], num_workers=1) - motif_locs = scanner.scan(BED_FILE_NAME, FASTA_FILE_NAME, min_ic=8) + motif_locs = scanner.scan(FASTA_FILE_NAME, atac_bed_file=BED_FILE_NAME, min_ic=8) self.assertEqual(motif_locs.shape[0], 10) self.assertEqual(motif_locs.loc[motif_locs[homer.HOMER_STRAND] == "+", :].shape[0], 5) diff --git a/inferelator_prior/tests/test_prior.py b/inferelator_prior/tests/test_prior.py index e69de29..20755fc 100644 --- a/inferelator_prior/tests/test_prior.py +++ b/inferelator_prior/tests/test_prior.py @@ -0,0 +1,110 @@ +import os +import unittest +import pandas as pd + +from inferelator_prior.motifs import meme, motifs_to_dataframe, MotifScan, fimo, MOTIF_NAME_COL, SCAN_SCORE_COL +from inferelator_prior.processor import prior, gtf + +artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), 
"artifacts") + +ECORI_FILE_NAME = os.path.join(artifact_path, "test_ecori.meme") +FASTA_FILE_NAME = os.path.join(artifact_path, "test_motif_search.fasta") +BED_FILE_NAME = os.path.join(artifact_path, "test_motif_search.bed") +GTF_FILE_NAME = os.path.join(artifact_path, "test_motif_search.gtf") +MOTIF_STARTS = [3, 35, 87, 199, 711] + + +class TestPriorPipeline(unittest.TestCase): + + def setUp(self): + self.motifs = meme.read(ECORI_FILE_NAME) + self.genes = gtf.load_gtf_to_dataframe(GTF_FILE_NAME) + self.motif_information = motifs_to_dataframe(self.motifs) + + def test_prior_agg_by_base(self): + motif_peaks, _ = self.do_scan_prior(20) + motif_peaks[fimo.FIMO_START] = [7, 13, 19, 1] + motif_peaks[fimo.FIMO_STOP] = [18, 24, 30, 12] + motif_peaks[MOTIF_NAME_COL] = 'ECORI' + + agg_peaks = prior.MotifScorer._agg_per_base(motif_peaks) + self.assertEqual(agg_peaks[fimo.FIMO_START].values[0], 1) + self.assertEqual(agg_peaks[fimo.FIMO_STOP].values[0], 30) + self.assertEqual(agg_peaks[prior.SCAN_SCORE_COL].values[0], 48) + + motif_peaks[fimo.FIMO_START] = [7, 13, 1, 19] + motif_peaks[fimo.FIMO_STOP] = [18, 24, 12, 30] + + agg_peaks = prior.MotifScorer._agg_per_base(motif_peaks) + + self.assertEqual(agg_peaks[fimo.FIMO_START].values[0], 1) + self.assertEqual(agg_peaks[fimo.FIMO_STOP].values[0], 30) + self.assertEqual(agg_peaks[prior.SCAN_SCORE_COL].values[0], 60) + + def test_prior_no_tandem_20_window(self): + prior.MotifScorer.set_information_criteria(min_binding_ic=8, max_dist=0) + motif_peaks, (prior_edges, prior_matrix, raw_matrix) = self.do_scan_prior(20) + self.assertEqual(motif_peaks.shape[0], 4) + self.assertEqual(prior_edges.shape[0], 1) + self.assertEqual(prior_edges['score'].values[0], 24.) 
+ + def test_prior_no_tandem_200_window(self): + prior.MotifScorer.set_information_criteria(min_binding_ic=8, max_dist=0) + motif_peaks, (prior_edges, prior_matrix, raw_matrix) = self.do_scan_prior(200) + self.assertEqual(motif_peaks.shape[0], 12) + self.assertEqual(prior_edges.shape[0], 1) + self.assertEqual(prior_edges['score'].values[0], 24.) + + def test_prior_50_tandem_200_window(self): + prior.MotifScorer.set_information_criteria(min_binding_ic=8, max_dist=50) + motif_peaks, (prior_edges, prior_matrix, raw_matrix) = self.do_scan_prior(200) + + self.assertEqual(motif_peaks.shape[0], 12) + self.assertEqual(prior_edges.shape[0], 1) + self.assertEqual(prior_edges['score'].values[0], 72.) + + def test_prior_50_tandem_10000_window(self): + prior.MotifScorer.set_information_criteria(min_binding_ic=8, max_dist=50) + motif_peaks, (prior_edges, prior_matrix, raw_matrix) = self.do_scan_prior(10000) + self.assertEqual(motif_peaks.shape[0], 14) + self.assertEqual(prior_edges.shape[0], 1) + self.assertEqual(prior_edges['score'].values[0], 72.) + + def test_prior_no_tandem_1000_window_no_bed(self): + prior.MotifScorer.set_information_criteria(min_binding_ic=8, max_dist=0) + motif_peaks, (prior_edges, prior_matrix, raw_matrix) = self.do_scan_prior(1000, use_bed=False) + self.assertEqual(motif_peaks.shape[0], 14) + self.assertEqual(prior_edges.shape[0], 1) + self.assertEqual(prior_edges['score'].values[0], 24.) 
+ + def test_multiple_genes_50_tandem_100_window(self): + prior.MotifScorer.set_information_criteria(min_binding_ic=8, max_dist=50) + print(self.genes) + self.genes = pd.concat((self.genes, pd.DataFrame({"seqname": "seq1", + "start": 550., + "end": 750., + "TSS": 750., + "gene_name": "TEST2", + "strand": "-"}, index=[1]))) + + motif_peaks, (prior_edges, prior_matrix, raw_matrix) = self.do_scan_prior(100) + self.assertEqual(motif_peaks.shape[0], 10) + self.assertEqual(prior_edges.shape[0], 2) + self.assertListEqual(prior_edges['score'].values.tolist(), [72., 24.]) + + def do_scan_prior(self, window_size, do_threshold=False, use_bed=True, use_tss=True): + genes = gtf.open_window(self.genes, window_size=window_size, use_tss=use_tss, + check_against_fasta=FASTA_FILE_NAME) + self.gene_locs = genes.loc[:, [gtf.GTF_CHROMOSOME, gtf.SEQ_START, gtf.SEQ_STOP, gtf.GTF_STRAND]].copy() + self.gene_locs[[gtf.SEQ_START, gtf.SEQ_STOP]] = self.gene_locs[[gtf.SEQ_START, gtf.SEQ_STOP]].astype(int) + + ms = MotifScan.scanner(motifs=self.motifs, num_workers=1) + motif_peaks = ms.scan(FASTA_FILE_NAME, + atac_bed_file=BED_FILE_NAME if use_bed else None, + promoter_bed=self.gene_locs, + min_ic=0, threshold=5e-4) + + return (motif_peaks, prior.build_prior_from_motifs(genes, motif_peaks, + self.motif_information, + num_workers=1, + do_threshold=do_threshold)) From 231b4eadf483917d022d50194d12a569e6595b83 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 4 Oct 2020 13:00:13 -0400 Subject: [PATCH 18/20] Fix 0 index error --- inferelator_prior/processor/gtf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inferelator_prior/processor/gtf.py b/inferelator_prior/processor/gtf.py index 81c3657..b2b6797 100644 --- a/inferelator_prior/processor/gtf.py +++ b/inferelator_prior/processor/gtf.py @@ -76,7 +76,7 @@ def open_window(annotation_dataframe, window_size, use_tss=False, check_against_ window_annotate.loc[window_annotate[GTF_STRAND] == "-", SEQ_START] = 
window_annotate[SEQ_START] - w_down window_annotate.loc[window_annotate[GTF_STRAND] == "-", SEQ_STOP] = window_annotate[SEQ_STOP] + w_up - window_annotate.loc[window_annotate[SEQ_START] < 0, SEQ_START] = 0 + window_annotate.loc[window_annotate[SEQ_START] < 1, SEQ_START] = 1 if check_against_fasta is not None: fasta_len = {} From c7ec2b65b2f80e165b9b841bf4d8d64a0e7b4656 Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 4 Oct 2020 13:00:26 -0400 Subject: [PATCH 19/20] Fix several bugs with scoring --- inferelator_prior/processor/prior.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/inferelator_prior/processor/prior.py b/inferelator_prior/processor/prior.py index c143ba4..18e0905 100644 --- a/inferelator_prior/processor/prior.py +++ b/inferelator_prior/processor/prior.py @@ -96,7 +96,7 @@ def score_tf(cls, tf_motifs): # If there's more than two sites do the complicated tandem checking stuff else: # Find things that are in tandems - consider_tandem = (tf_motifs[MotifScan.stop_col] - tf_motifs[MotifScan.start_col].shift(1)) + consider_tandem = (tf_motifs[MotifScan.start_col] - tf_motifs[MotifScan.stop_col].shift(1)) consider_tandem = consider_tandem <= cls.max_dist # Skip the rest if nothing is close enough to matter @@ -155,8 +155,8 @@ def _first_value(series): except AttributeError: return series - @classmethod - def _agg_per_base(cls, overlap_df): + @staticmethod + def _agg_per_base(overlap_df): """ Aggregate an overlapping set of motif peaks by summing the maximum per-base IC for each base :param overlap_df: @@ -167,17 +167,21 @@ def _agg_per_base(cls, overlap_df): overlap_df.reset_index(inplace=True) - new_df = [(a, b) for i in overlap_df.index for a, b in zip(range(overlap_df.loc[i, MotifScan.start_col], - overlap_df.loc[i, MotifScan.stop_col]), - overlap_df.loc[i, SCORE_PER_BASE])] + # Melt the per-base information contents for each matching motif into a new dataframe + # Base number ["B"] and float score 
["S"] + new_df = pd.DataFrame([(a, b) for i in overlap_df.index + for a, b in zip(range(overlap_df.loc[i, MotifScan.start_col], + overlap_df.loc[i, MotifScan.stop_col] + 1), + overlap_df.loc[i, SCORE_PER_BASE])], columns=["B", "S"]) + # Return a new dataframe with the maximum per-base scores aggregated return pd.DataFrame({MotifScan.start_col: [overlap_df[MotifScan.start_col].min()], MotifScan.stop_col: [overlap_df[MotifScan.stop_col].max()], - SCAN_SCORE_COL: pd.DataFrame(new_df, columns=["B", "S"]).groupby("B").agg('max').sum(), + SCAN_SCORE_COL: new_df.groupby("B").agg('max').sum(), MOTIF_NAME_COL: [overlap_df[MOTIF_NAME_COL].unique()[0]]}) -def build_prior_from_atac_motifs(genes, motif_peaks, motif_information, num_workers=1, seed=42): +def build_prior_from_motifs(genes, motif_peaks, motif_information, num_workers=1, seed=42, do_threshold=True): """ Construct a prior [G x K] interaction matrix :param genes: pd.DataFrame [G x n] @@ -234,8 +238,10 @@ def _prior_mapper(data): prior_matrix = raw_matrix.copy() # Threshold per-TF using DBSCAN - for reg in prior_matrix.columns: - prior_matrix.loc[~_find_outliers_dbscan(prior_matrix[reg]), reg] = 0. + + if do_threshold: + for reg in prior_matrix.columns: + prior_matrix.loc[~_find_outliers_dbscan(prior_matrix[reg]), reg] = 0. 
# Keep the peaks that we want thresholded_data = prior_matrix.reset_index().melt(id_vars=PRIOR_GENE, var_name=PRIOR_TF, value_name='T') From dd34c30acfd35cd8a056ca437a150c595b65edde Mon Sep 17 00:00:00 2001 From: asistradition Date: Sun, 4 Oct 2020 13:08:23 -0400 Subject: [PATCH 20/20] Update README and setup --- README.md | 27 +++++++++++++++++++++------ setup.py | 10 +++++----- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index b2a449c..190a176 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,29 @@ -# srrTomat0 - -[![Travis](https://travis-ci.org/cskokgibbs/srrTomat0.svg?branch=master)](https://travis-ci.org/cskokgibbs/srrTomat0) +# inferelator-prior This is a set of pipelines to create expression and prior matrices for network inference. They are designed to create -data that is compatible with the [inferelator](https://github.com/flatironinstitute/inferelator) package. In addition to +data that is compatible with the [inferelator](https://github.com/flatironinstitute/inferelator) package. + +### Usage + + python -m inferelator_prior.network_from_motifs + usage: network_from_motifs.py -m motif_PWM_file.meme + -f genome_fasta_file.fasta + -g genome_annotation_file.gtf + -o ~/output/path/prefix + [--species {yeast,fly,mouse,human}] + +This requires a motif PWM database (`-m PATH`), +a genome to search (both sequence as a FASTA `-f PATH` and annotations `-g PATH`), +and an output prefix for several files (`-o PATH`). +In addition, default settings for a specific species can be set with (`--species`). + +### Requirements + +In addition to python dependencies, this package also requires [STAR](https://github.com/alexdobin/STAR), [sra-tools](http://ncbi.github.io/sra-tools/), [bedtools](https://bedtools.readthedocs.io/en/latest/), -[samtools](http://www.htslib.org/), -[homer](http://homer.ucsd.edu/homer/), and +[samtools](http://www.htslib.org/), and [fimo](http://meme-suite.org/doc/fimo.html). 
diff --git a/setup.py b/setup.py index 2d441f9..0be4204 100644 --- a/setup.py +++ b/setup.py @@ -3,24 +3,24 @@ install_requires = ["numpy", "pandas", "HTSeq", "pybedtools", "scipy", "pathos"] tests_require = ["coverage", "nose", "pysam"] -version = "0.1.1" +version = "0.2.0" # Description from README.md base_dir = os.path.dirname(os.path.abspath(__file__)) long_description = "\n\n".join([open(os.path.join(base_dir, "README.md"), "r").read()]) setup( - name="srrTomat0", + name="inferelator_prior", version=version, - description="SRR Pipelines: Building matrixes from read data", + description="Inferelator-Prior Network Generation Tool", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/flatironinstitute/srrTomat0", + url="https://github.com/flatironinstitute/inferelator-prior", author="Chris Jackson", author_email="cj59@nyu.edu", maintainer="Chris Jackson", maintainer_email="cj59@nyu.edu", - packages=find_packages(include=["srrTomat0", "srrTomat0.*"]), + packages=find_packages(include=["inferelator_prior", "inferelator_prior.*"]), zip_safe=False, install_requires=install_requires, tests_require=tests_require,