diff --git a/.github/workflows/piranha.yml b/.github/workflows/piranha.yml index 1b06669..6a137d3 100644 --- a/.github/workflows/piranha.yml +++ b/.github/workflows/piranha.yml @@ -26,5 +26,5 @@ jobs: run: piranha --version - name: Run piranha with test data run: piranha -i piranha/test/pak_run/demultiplexed --verbose -b piranha/test/pak_run/barcodes01.csv -t 2 2>&1 | tee piranha.log - - name: Run piranha with all data - run: piranha -i piranha/test/pak_run/demultiplexed --verbose -b piranha/test/pak_run/barcodes.csv -t 2 2>&1 | tee piranha_all.log + - name: Run piranha in phylo mode + run: piranha -i piranha/test/pak_run/demultiplexed --verbose -b piranha/test/pak_run/barcodes.csv -t 2 -rp -ud -sd piranha/test/supp_data 2>&1 | tee piranha_phylo.log diff --git a/README.md b/README.md index fdcb3e2..264825b 100644 --- a/README.md +++ b/README.md @@ -367,17 +367,21 @@ and piranha will check which ones you have installed with your version of medaka ## Optional phylogenetics module **\*NEW FEATURE\*** -Piranha allows the user to optionally run a phylogenetics module in addition to variant calling and consensus builing. There are 3 additional dependencies needed if you wish to run this module: +Piranha allows the user to optionally run a phylogenetics module in addition to variant calling and consensus building. If you have previously installed piranha, there are 3 additional dependencies needed if you wish to run this module: - IQTREE2 - mafft - jclusterfunk +The latest environment file contains these dependencies, so to install them you can just update your environment (`conda env update -f environment.yml`) or run using the latest piranha GUI image. + This module will cluster any consensus sequences generated during the run into `reference_group`, so either `Sabin1-related`, `Sabin2-related`, `Sabin3-related` or `WPV1` and will ultimately build one maximum-likelihood phylogeny for each reference group with consensus sequences in a given sequencing run. To annotate the phylogeny with certain metadata from the barcodes.csv file, specify columns to include with `-pcol/--phylo-metadata-columns`. Piranha then extracts any relevant reference sequences from the installed reference file (identified by having `display_name=Sabin1-related` in their sequence header, or whichever reference group the relevant phylogeny will be for). -An optional file of local sequences can be supplied to supplement the phylogenetic analysis with `-ss/--supplementary-sequences`. This file should be in FASTA format, but does not need to be aligned. To allow piranha to assign the sequences to the relevant phylogeny, this file should have the reference group annotated in the header in the format `display_name=Sabin1-related`, for example. +An optional set of local sequences can be supplied to supplement the phylogenetic analysis. To supply them to piranha, point to the correct directory using `-sd/--supplementary-datadir`. The sequence files should be in FASTA format, but do not need to be aligned. To allow piranha to assign the sequences to the relevant phylogeny, the sequence files should have the reference group annotated in the header in the format `display_name=Sabin1-related`, for example. + +These supplementary sequence files can be accompanied by CSV metadata files (one row per supplementary sequence), and this metadata can be included in the final report and annotated onto the phylogenies (`-smcol/--supplementary-metadata-columns`). By default, the metadata is matched to the FASTA sequence name with a column titled `sequence_name`, but this header name can be configured by specifying `-smid/--supplementary-metadata-id-column`. -This supplementary sequence file can be accompanied with a csv metadata file (one row per supplementary sequence) (`-sm/--supplementary-metadata`) and this metadata can be included in the final report and annotated onto the phylogenies (`-smcol/--supplementary-metadata-columns`). By default, the metadata is matched to the FASTA sequence name with a column titled `sequence_name` but this header name can be configured by specifying `-smid/--supplementary-metadata-id-column` +Piranha will iterate across the directory supplied and amalgamate the FASTA files, retaining any sequences with `display_name=X` in the header description, where X can be one of `Sabin1-related`, `Sabin2-related`, `Sabin3-related` or `WPV1`. It will then read every CSV file it detects in this directory and attempt to match any metadata to the gathered FASTA records. These will be added to the relevant phylogenies.
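+
+For example, a record in one of the supplementary FASTA files and a matching row in an accompanying metadata CSV might look like the following minimal sketch (the sequence is truncated, the metadata values are illustrative, `sequence_name` is the default ID column, and `location`/`lineage` are the default metadata columns):
+
+```
+>1177|VDPV display_name=Sabin3-related
+GGTATTGAAGATTTGATCCCTGAAGTTGCACAGGGCGCC...
+```
+
+```
+sequence_name,location,lineage
+1177|VDPV,PAK,VDPV3
+```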
The phylogenetic pipeline is activated by running with the flag `-rp/--run-phylo`, which then triggers the following analysis steps: - Amalgamate the newly generated consensus sequences for all barcodes into their respective reference groups. @@ -391,6 +395,11 @@ The phylogenetic pipeline is activated by running with the flag `-rp/--run-phylo - Annotate the tree newick files with the specified metadata (Default: just whether it's a new consensus sequence or not). - Extract phylogenetic trees and embed in interactive report. +## Update local database **\*NEW FEATURE\*** + +If you supply a path via `-sd/--supplementary-datadir` for the phylogenetics module, you can optionally update this data directory with the new consensus sequences generated during the piranha analysis. If you run with the `-ud/--update-local-database` flag, piranha will write out the new sequences and any accompanying metadata supplied into the directory provided. + + ## Output options By default the output directory will be created in the current working directory and will be named `analysis-YYYY-MM-DD`, where YYYY-MM-DD is today's date. This output can be configured in a number of ways. For example, the prefix `analysis` can be overwritten by using the `-pre/--output-prefix new_prefix` flag (or `output_prefix: new_prefix` in a config file) and this will change the default behaviour to `new_prefix_YYYY-MM-DD`. It's good practice not to include spaces or special characters in your directory names.
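+
+Putting this together, an illustrative invocation that runs the phylogenetics module with a supplementary data directory, updates the local database and sets a custom output prefix (the paths are the bundled test data, and the prefix `my_run` is arbitrary) would be:
+
+```
+piranha -i piranha/test/pak_run/demultiplexed -b piranha/test/pak_run/barcodes.csv -t 2 -rp -sd piranha/test/supp_data -ud -pre my_run
+```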
diff --git a/environment.yml b/environment.yml index bfde79c..ee407df 100644 --- a/environment.yml +++ b/environment.yml @@ -5,13 +5,12 @@ channels: - defaults dependencies: - python=3.10 + - bcftools=1.11 - coreutils=9.1 + - iqtree>=2.1 + - cov-ert::jclusterfunk>=0.0.25 - mafft - minimap2=2.17 - samtools=1.11 - - bcftools=1.11 - - tabix=1.11 - snakemake-minimal - - iqtree>=2.1 - - mafft - - cov-ert::jclusterfunk>=0.0.25 + - tabix=1.11 diff --git a/piranha/analysis/phylo_functions.py b/piranha/analysis/phylo_functions.py index e3e3aed..f70524e 100644 --- a/piranha/analysis/phylo_functions.py +++ b/piranha/analysis/phylo_functions.py @@ -14,7 +14,7 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence seq_clusters = collections.defaultdict(list) header = VALUE_PHYLO_HEADER - + written = {} for record in SeqIO.parse(sample_seqs,KEY_FASTA): for ref_group in config[KEY_REFERENCES_FOR_CNS]: @@ -26,8 +26,8 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence """ fields = record.description.split(" ") - record_id = fields[0] - record_sample,reference_group,cns_id,epid,sample_date = record_id.split("|") + + record_sample,reference_group,cns_id,epid,sample_date = record.id.split("|") description_dict = {} for field in fields[1:]: @@ -37,14 +37,12 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence if ref_group == reference_group: new_record = record + new_record.description = "" + new_record.id = record.id barcode = description_dict[KEY_BARCODE] - name = record_id + name = new_record.id - new_record.description = name - new_record.id = name - - seq_clusters[ref_group].append(new_record) seq_metadata[name][KEY_NAME] = name seq_metadata[name][KEY_SAMPLE] = record_sample @@ -66,6 +64,8 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence call = "Sabin-like" seq_metadata[name][KEY_CALL] = call + seq_clusters[ref_group].append(new_record) + continue print(green("Reference groups for phylo pipeline:")) @@ -74,15 +74,20 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence if supplementary_sequences: for record in SeqIO.parse(supplementary_sequences,KEY_FASTA): - for ref_group in seq_clusters: - if ref_group in record.description: - seq_clusters[ref_group].append(record) - - seq_metadata[record.id][KEY_NAME] = record.id - seq_metadata[record.id][KEY_SAMPLE] = record.id - seq_metadata[record.id][KEY_SOURCE] = "Background" - seq_metadata[record.id][KEY_REFERENCE_GROUP] = ref_group - seq_metadata[record.id][KEY_CALL] = ref_group + if record: + for ref_group in seq_clusters: + if ref_group in record.description: + + + seq_metadata[record.id][KEY_NAME] = record.id + seq_metadata[record.id][KEY_SAMPLE] = record.id + seq_metadata[record.id][KEY_SOURCE] = "Background" + seq_metadata[record.id][KEY_REFERENCE_GROUP] = ref_group + seq_metadata[record.id][KEY_CALL] = ref_group + + new_record = record + new_record.description = "" + seq_clusters[ref_group].append(new_record) if supplementary_metadata: with open(supplementary_metadata, "r") as f: @@ -99,27 +104,29 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence seq_metadata[sample][col] = row[col] for record in SeqIO.parse(reference_sequences, KEY_FASTA): - for ref_group in seq_clusters: - if ref_group in record.description: - seq_clusters[ref_group].append(record) + if record: + for ref_group in seq_clusters: + if ref_group in record.description: + 
seq_clusters[ref_group].append(record) - seq_metadata[record.id][KEY_NAME] = record.id - seq_metadata[record.id][KEY_SAMPLE] = record.id - - seq_metadata[record.id][KEY_REFERENCE_GROUP] = ref_group - seq_metadata[record.id][KEY_CALL] = ref_group + seq_metadata[record.id][KEY_NAME] = record.id + seq_metadata[record.id][KEY_SAMPLE] = record.id + + seq_metadata[record.id][KEY_REFERENCE_GROUP] = ref_group + seq_metadata[record.id][KEY_CALL] = ref_group - if "Sabin" in record.description: - seq_metadata[record.id][KEY_SOURCE] = "Sabin" - else: - seq_metadata[record.id][KEY_SOURCE] = "Reference" + if "Sabin" in record.description: + seq_metadata[record.id][KEY_SOURCE] = "Sabin" + else: + seq_metadata[record.id][KEY_SOURCE] = "Reference" for record in SeqIO.parse(outgroup_sequences, KEY_FASTA): for ref_group in seq_clusters: if ref_group in record.description: new_record = record + new_record.description = "" new_record.id = "outgroup" - new_record.description = "outgroup" + seq_clusters[ref_group].append(new_record) with open(barcodes_csv, "r") as f: @@ -162,7 +169,12 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence writer0.writerow(row) with open(os.path.join(phylo_outdir, f"{i}.fasta"),"w") as fw: - SeqIO.write(seq_clusters[i], fw, KEY_FASTA) + records = seq_clusters[i] + record_dict = {} + for record in records: + record_dict[record.id] = record + unique_records = [r for r in record_dict.values()] + SeqIO.write(unique_records, fw, KEY_FASTA) tree_annotations = config[KEY_TREE_ANNOTATIONS] for i in header: @@ -172,22 +184,49 @@ def get_seqs_and_clusters(sample_seqs,supplementary_sequences,reference_sequence return list(seq_clusters.keys()),tree_annotations -def update_local_database(supplementary_sequences,sample_sequences,output_file): - with open(output_file,"w") as fw: - countall = 0 +def update_local_database(sample_sequences,detailed_csv,new_db_seqs,new_db_metadata,config): + + record_ids = {} + with open(new_db_seqs,"w") as fw: countnew = 0 - for record in SeqIO.parse(supplementary_sequences, "fasta"): - SeqIO.write(record, fw, "fasta") - countall+=1 for record in SeqIO.parse(sample_sequences, "fasta"): new_record = record desc_list = new_record.description.split(" ") - new_desc_list = [i for i in desc_list if not i.startswith("barcode=")] - new_record.description = " ".join(new_desc_list) - SeqIO.write(new_record, fw, "fasta") - countall+=1 - countnew+=1 + write_record = True + + for i in desc_list: + if i.startswith("variant_count"): + count = int(i.split("=")[1]) + if count < 6: + write_record = False + + if write_record: + new_desc_list = [i for i in desc_list if not i.startswith("barcode=")] + new_record.description = " ".join(new_desc_list) + + SeqIO.write(new_record, fw, "fasta") + countnew+=1 + sample = record.id.split("|")[0] + record_ids[record.id] = sample + + with open(new_db_metadata,"w") as fw: + with open(detailed_csv,"r") as f: + reader = csv.DictReader(f) + header = reader.fieldnames + header.append(config[KEY_SUPPLEMENTARY_METADATA_ID_COLUMN]) + + writer = csv.DictWriter(fw, fieldnames=header, lineterminator="\n") + writer.writeheader() + sample_data = {} + for row in reader: + sample = row[KEY_SAMPLE] + sample_data[sample] = row + + for record_id in record_ids: + sample = record_ids[record_id] + row = sample_data[sample] + row[config[KEY_SUPPLEMENTARY_METADATA_ID_COLUMN]] = record_id + writer.writerow(row) print(green(f"Local database updated with ")+ f"{countnew}"+ green(" newly generated records.")) - print(green(f"Total records in 
local database:"), countall) diff --git a/piranha/analysis/stool_functions.py b/piranha/analysis/stool_functions.py index 476bdfc..1025998 100644 --- a/piranha/analysis/stool_functions.py +++ b/piranha/analysis/stool_functions.py @@ -33,58 +33,59 @@ def gather_fasta_files(summary_info, barcodes_csv, input_cns_list,all_metdata,ru cns_counter = collections.Counter() for cns_file in input_cns_list: for record in SeqIO.parse(cns_file, KEY_FASTA): - cns_info= record.description.split(" ") - ref,barcode,var_count,var_string=cns_info[0].split("|") - - info = [] - for row in analysis_info[barcode]: - if row[KEY_REFERENCE] == ref: - info = row + if record: + cns_info= record.description.split(" ") + ref,barcode,var_count,var_string=cns_info[0].split("|") + + info = [] + for row in analysis_info[barcode]: + if row[KEY_REFERENCE] == ref: + info = row - metadata = input_metadata[barcode] + metadata = input_metadata[barcode] - record_id = f"{metadata[KEY_SAMPLE]}|{info[KEY_REFERENCE_GROUP]}" - cns_counter[record_id] += 1 + record_id = f"{metadata[KEY_SAMPLE]}|{info[KEY_REFERENCE_GROUP]}" + cns_counter[record_id] += 1 - record_id += f"|CNS{cns_counter[record_id]}" + record_id += f"|CNS{cns_counter[record_id]}" - if KEY_EPID in metadata: - record_id += f"|{metadata[KEY_EPID]}" - else: - record_id += "|" + if KEY_EPID in metadata: + record_id += f"|{metadata[KEY_EPID]}" + else: + record_id += "|" - if KEY_DATE in metadata: - record_id += f"|{metadata[KEY_DATE]}" - else: - record_id += "|" + if KEY_DATE in metadata: + record_id += f"|{metadata[KEY_DATE]}" + else: + record_id += "|" - record_id += f" {KEY_BARCODE}={barcode}" - record_id += f" {KEY_REFERENCE}={ref}" - record_id += f" {KEY_REFERENCE_MATCH_FIELD}={info[KEY_REFERENCE_GROUP]}" + record_id += f" {KEY_BARCODE}={barcode}" + record_id += f" {KEY_REFERENCE}={ref}" + record_id += f" {KEY_REFERENCE_MATCH_FIELD}={info[KEY_REFERENCE_GROUP]}" - if runname: - record_id += f" {KEY_RUNNAME}={runname}" + if runname: + record_id += f" {KEY_RUNNAME}={runname}" - if "Sabin" in ref: - record_id += f" {KEY_VARIANT_COUNT}={var_count}" - record_id += f" {KEY_VARIANTS}={var_string}" - else: - record_id += f" {KEY_VARIANT_COUNT}=NA" - record_id += f" {KEY_VARIANTS}=NA" + if "Sabin" in ref: + record_id += f" {KEY_VARIANT_COUNT}={var_count}" + record_id += f" {KEY_VARIANTS}={var_string}" + else: + record_id += f" {KEY_VARIANT_COUNT}=NA" + record_id += f" {KEY_VARIANTS}=NA" - if all_metdata: - - for col in metadata: - if col != KEY_SAMPLE and col != KEY_BARCODE: - record_id += f" {col}={metadata[col]}" - """ - record header is: - >SAMPLE|REFERENCE_GROUP|CNS_ID|EPID|DATE barcode=barcode01 variant_count=8 variants=17:CT;161:CT;427:GA;497:AC;507:CT;772:AG;822:CT;870:CA + if all_metdata: + + for col in metadata: + if col != KEY_SAMPLE and col != KEY_BARCODE: + record_id += f" {col}={metadata[col]}" + """ + record header is: + >SAMPLE|REFERENCE_GROUP|CNS_ID|EPID|DATE barcode=barcode01 variant_count=8 variants=17:CT;161:CT;427:GA;497:AC;507:CT;772:AG;822:CT;870:CA - if "all_metadata" then everything else gets added to the description - """ - fw.write(f">{record_id}\n{record.seq}\n") - handle_dict[barcode].write(f">{record_id}\n{record.seq}\n") + if "all_metadata" then everything else gets added to the description + """ + fw.write(f">{record_id}\n{record.seq}\n") + handle_dict[barcode].write(f">{record_id}\n{record.seq}\n") for handle in handle_dict: handle_dict[handle].close() diff --git a/piranha/command.py b/piranha/command.py index cf05d56..24d9d9b 100644 --- 
a/piranha/command.py +++ b/piranha/command.py @@ -55,12 +55,11 @@ def main(sysargs = sys.argv[1:]): phylo_group = parser.add_argument_group('Phylogenetics options') phylo_group.add_argument("-rp","--run-phylo",action="store_true",help=f"Trigger the optional phylogenetics module. Additional dependencies may need to be installed.") - phylo_group.add_argument("-ss","--supplementary-sequences",action="store",help=f"Supplementary sequence FASTA file to be incorporated into phylogenetic analysis.") - phylo_group.add_argument("-sm","--supplementary-metadata",action="store",help=f"Supplementary metadata file associated with the supplementary sequence FASTA file.") + phylo_group.add_argument("-sd","--supplementary-datadir",action="store",help=f"Path to directory containing supplementary sequence FASTA files and optional metadata to be incorporated into phylogenetic analysis.") phylo_group.add_argument("-pcol","--phylo-metadata-columns",action="store",help=f"Columns in the barcodes.csv file to annotate the phylogeny with. Default: {VALUE_PHYLO_METADATA_COLUMNS}") - phylo_group.add_argument("-smcol","--supplementary-metadata-columns",action="store",help=f"Columns in the supplementary metadata file to annotate the phylogeny with. Default: {VALUE_SUPPLEMENTARY_METADATA_COLUMNS}") - phylo_group.add_argument("-smid","--supplementary-metadata-id-column",action="store",help=f"Column in the supplementary metadata file to match with the supplementary sequences. Default: {VALUE_SUPPLEMENTARY_METADATA_ID_COLUMN}") - phylo_group.add_argument("-db","--update-local-database",action="store_true",help=f"Amalgamate newly generated consensus sequences with the supplied supplementary sequence FASTA file and write to file.") + phylo_group.add_argument("-smcol","--supplementary-metadata-columns",action="store",help=f"Columns in the supplementary metadata to annotate the phylogeny with. Default: {VALUE_SUPPLEMENTARY_METADATA_COLUMNS}") + phylo_group.add_argument("-smid","--supplementary-metadata-id-column",action="store",help=f"Column in the supplementary metadata files to match with the supplementary sequences. Default: {VALUE_SUPPLEMENTARY_METADATA_ID_COLUMN}") + phylo_group.add_argument("-ud","--update-local-database",action="store_true",help=f"Amalgamate newly generated consensus sequences and any accompanying metadata into the supplied supplementary data directory.") o_group = parser.add_argument_group('Output options') o_group.add_argument('-o','--outdir', action="store",help=f"Output directory. 
Default: `{VALUE_OUTPUT_PREFIX}-2022-XX-YY`") @@ -140,37 +139,37 @@ def main(sysargs = sys.argv[1:]): args.negative_control, config) + # sets up the output dir, temp dir, and data output destination + directory_setup.output_group_parsing(args.outdir, + args.output_prefix, + args.overwrite, + args.datestamp, + args.tempdir, + args.no_temp, + config) + + init.misc_args_to_config(args.verbose, + args.threads, + args.username, + args.institute, + args.runname, + config) # runs qc checks on the phylo input options and configures the phylo settings + # now need tempdir for this parsing, so run after directory_setup + # also needs runname to not add runname.today.fasta to the db input_qc.phylo_group_parsing(args.run_phylo, - args.supplementary_sequences, - args.supplementary_metadata, + args.update_local_database, + args.supplementary_datadir, args.phylo_metadata_columns, config[KEY_BARCODES_CSV], args.supplementary_metadata_columns, args.supplementary_metadata_id_column, - args.update_local_database, config) if config[KEY_RUN_PHYLO]: # checks the phylo-specific dependencies dependency_checks.check_dependencies(PHYLO_DEPENDENCY_LIST, PHYLO_MODULE_LIST) - # sets up the output dir, temp dir, and data output desination - directory_setup.output_group_parsing(args.outdir, - args.output_prefix, - args.overwrite, - args.datestamp, - args.tempdir, - args.no_temp, - config) - - init.misc_args_to_config(args.verbose, - args.threads, - args.username, - args.institute, - args.runname, - config) - # ready to run? either verbose snakemake or quiet mode init.set_up_verbosity(config) @@ -222,10 +221,6 @@ def main(sysargs = sys.argv[1:]): print(green("Initializing phylo pipeline.")) status = misc.run_snakemake(config,phylo_snakefile,config) - if config[KEY_UPDATE_LOCAL_DATABASE]: - output_db = os.path.join(config[KEY_OUTDIR],"published_data",f"updated_database.{config[KEY_TODAY]}.fasta") - phylo_functions.update_local_database(config[KEY_SUPPLEMENTARY_SEQUENCES],config[KEY_SAMPLE_SEQS],output_db) - # get the inputs for making the overall report report =os.path.join(config[KEY_OUTDIR],OUTPUT_REPORT) summary_csv=os.path.join(config[KEY_TEMPDIR],PREPROCESSING_SUMMARY) @@ -242,6 +237,11 @@ def main(sysargs = sys.argv[1:]): config[KEY_ANNOTATIONS], config) + if config[KEY_UPDATE_LOCAL_DATABASE]: + new_db_seqs = os.path.join(config[KEY_SUPPLEMENTARY_DATADIR],f"{config[KEY_RUNNAME]}.{config[KEY_TODAY]}.fasta") + new_db_metadata = os.path.join(config[KEY_SUPPLEMENTARY_DATADIR],f"{config[KEY_RUNNAME]}.{config[KEY_TODAY]}.csv") + phylo_functions.update_local_database(config[KEY_SAMPLE_SEQS],detailed_csv,new_db_seqs,new_db_metadata,config) + for r,d,f in os.walk(os.path.join(config[KEY_OUTDIR],"published_data")): for fn in f: if not os.path.getsize(os.path.join(r,fn)): diff --git a/piranha/input_parsing/input_qc.py b/piranha/input_parsing/input_qc.py index 080f40d..6cc427b 100644 --- a/piranha/input_parsing/input_qc.py +++ b/piranha/input_parsing/input_qc.py @@ -96,52 +96,42 @@ def parse_barcodes_csv(barcodes_csv,config): print(f"- {i}") sys.exit(-1) -def qc_supplementary_sequence_file(supplementary_sequences): - misc.check_path_exists(supplementary_sequences) - incorrect = 0 - total = 0 - seq_ids = set() - try: - for record in SeqIO.parse(supplementary_sequences,"fasta"): - seq_ids.add(record.id) - - total +=1 - passed = False - for field in record.description.split(" "): - if field.startswith(KEY_DISPLAY_NAME): - passed=True - continue - if not passed: - incorrect +=1 - except: - sys.stderr.write(cyan(f"Failed to parse 
supplementary sequence file, check it is in FASTA format.\n")) - sys.exit(-1) - - if incorrect >= 1: - sys.stderr.write(cyan(f"Supplementary sequences file lacks `{KEY_DISPLAY_NAME}` annotation in header of {incorrect} out of {total} sequences parsed.\n")) - sys.exit(-1) - else: - print(green("Supplementary sequences:"), total, "sequences parsed.") +# def qc_supplementary_sequence_file(supplementary_sequences): +# misc.check_path_exists(supplementary_sequences) +# incorrect = 0 +# total = 0 +# seq_ids = set() +# try: +# for record in SeqIO.parse(supplementary_sequences,"fasta"): +# seq_ids.add(record.id) + +# total +=1 +# passed = False +# for field in record.description.split(" "): +# if field.startswith(KEY_DISPLAY_NAME): +# passed=True +# continue +# if not passed: +# incorrect +=1 +# except: +# sys.stderr.write(cyan(f"Failed to parse supplementary sequence file, check it is in FASTA format.\n")) +# sys.exit(-1) + +# if incorrect >= 1: +# sys.stderr.write(cyan(f"Supplementary sequences file lacks `{KEY_DISPLAY_NAME}` annotation in header of {incorrect} out of {total} sequences parsed.\n")) +# sys.exit(-1) +# else: +# print(green("Supplementary sequences:"), total, "sequences parsed.") - return seq_ids - -def qc_supplementary_metadata_file(supplementary_metadata,seq_ids,config): - if not config[KEY_SUPPLEMENTARY_SEQUENCES]: - sys.stderr.write(cyan(f"Error: Supplementary metadata supplied without accompanying sequence file.\n")) - sys.exit(-1) +# return seq_ids - if not os.path.exists(supplementary_metadata): - sys.stderr.write(cyan(f"Error: Cannot find input file {supplementary_metadata}.\n")) - sys.exit(-1) +def qc_supplementary_metadata_file(supplementary_metadata,config): with open(supplementary_metadata,"r") as f: reader = csv.DictReader(f) - if config[KEY_SUPPLEMENTARY_METADATA_ID_COLUMN] not in reader.fieldnames: - sys.stderr.write(cyan(f"Error: {supplementary_metadata} missing id column `{config[KEY_SUPPLEMENTARY_METADATA_ID_COLUMN]}`.\n")) - sys.exit(-1) - missing = set() for col in config[KEY_SUPPLEMENTARY_METADATA_COLUMNS]: + if col not in reader.fieldnames and col not in VALUE_SUPPLEMENTARY_METADATA_COLUMNS: missing.add(col) if missing: @@ -150,41 +140,138 @@ def qc_supplementary_metadata_file(supplementary_metadata,seq_ids,config): sys.stderr.write(cyan(f"- {i}\n")) sys.exit(-1) - in_seq_file = set() - for row in reader: - if row[config[KEY_SUPPLEMENTARY_METADATA_ID_COLUMN]] in seq_ids: - in_seq_file.add(row[config[KEY_SUPPLEMENTARY_METADATA_ID_COLUMN]]) - - not_in_metadata = set() - for seq_id in seq_ids: - if seq_id not in in_seq_file: - not_in_metadata.add(seq_id) - - if not_in_metadata: - sys.stderr.write(cyan(f"Error: the following {len(not_in_metadata)} supplementary sequences do not have accompanying metadata:\n")) - for i in seq_id: - print(f"- {seq_id}") - sys.exit(-1) + +def parse_fasta_file(supplementary_datadir,supp_file,seq_records,no_reference_group,total_seqs,seq_info,config): + for record in SeqIO.parse(os.path.join(supplementary_datadir,supp_file),"fasta"): + total_seqs["total"] +=1 + ref_group = "" + for field in record.description.split(" "): + if field.startswith(KEY_DISPLAY_NAME): + ref_group = field.split("=")[1] + + if ref_group not in config[KEY_REFERENCES_FOR_CNS]: + no_reference_group.add(record.id) + else: + total_seqs[ref_group]+=1 + seq_records.append(record) + seq_info[record.id]= {} + +def check_there_are_seqs(total_seqs,supplementary_datadir,no_reference_group,config): + if total_seqs["total"]==0: + sys.stderr.write(cyan(f"Error: No sequence 
files matched in `{supplementary_datadir}`.\nEnsure the directory provided contains FASTA files with appropriate annotations in the header.\n")) + sys.stderr.write(cyan(f"Header must specify one of {config[KEY_REFERENCES_FOR_CNS]} under {KEY_DISPLAY_NAME}=X, where X is the appropriate group to be included in phylo pipeline.\n")) + sys.exit(-1) + + elif no_reference_group: + print(cyan(f"Warning: not all sequences in {supplementary_datadir} included in analysis because they lack annotations in the header description.")) + print(cyan("Number of seqs not included:"),f"{len(no_reference_group)}") + + print(green(f"Total supplementary sequences:"),total_seqs["total"]) + for reference_group in config[KEY_REFERENCES_FOR_CNS]: + print(green(f"Total {reference_group} sequences:"), total_seqs[reference_group]) + +def gather_supplementary_data(supplementary_datadir,supplementary_sequences,supplementary_metadata,supplementary_metadata_id_column,config): + + seq_records = [] + + no_reference_group = set() + total_seqs = collections.Counter() + + seq_info = collections.defaultdict(dict) + metadata_info = collections.defaultdict(dict) + passed_over_csvs = set() + with open(supplementary_sequences,"w") as fw: + for r,d,f in os.walk(supplementary_datadir): + for supp_file in f: + if supp_file.endswith(".fasta") or supp_file.endswith(".fa"): + + today_data = f"{config[KEY_RUNNAME]}.{config[KEY_TODAY]}.fasta" + if supp_file != today_data: + parse_fasta_file(supplementary_datadir,supp_file,seq_records,no_reference_group,total_seqs,seq_info,config) + + elif supp_file.endswith("csv"): + with open(os.path.join(supplementary_datadir,supp_file),"r") as f: + reader = csv.DictReader(f) + if supplementary_metadata_id_column not in reader.fieldnames: + passed_over_csvs.add(supp_file) + continue + else: + for row in reader: + seq_name = row[supplementary_metadata_id_column] + metadata_info[seq_name] = row + + check_there_are_seqs(total_seqs,supplementary_datadir,no_reference_group,config) + + SeqIO.write(seq_records,fw, "fasta") + + supplementary_metadata_header = set() + + for seq_id in seq_info: + if seq_id in metadata_info: + row = metadata_info[seq_id] + seq_info[seq_id] = row + for col in row: + supplementary_metadata_header.add(col) + + with open(supplementary_metadata,"w") as fw: + writer = csv.DictWriter(fw, fieldnames=supplementary_metadata_header,lineterminator="\n") + writer.writeheader() + for seq_id in seq_info: + row = seq_info[seq_id] + for field in supplementary_metadata_header: + if field not in row: + row[field] = "" + + writer.writerow(row) def phylo_group_parsing(run_phylo_arg, - supplementary_sequences_arg, - supplementary_metadata_arg, + update_local_database, + supplementary_datadir, phylo_metadata_columns_arg, barcodes_csv, supplementary_metadata_columns_arg, supplementary_metadata_id_column_arg, - update_local_database, config): misc.add_arg_to_config(KEY_RUN_PHYLO,run_phylo_arg,config) if config[KEY_RUN_PHYLO] not in [True, False]: - sys.stderr.write(cyan(f"`run_phylo` argument must be either True/False if specified through the config file.\n")) + sys.stderr.write(cyan(f"`{KEY_RUN_PHYLO}` argument must be either True/False if specified through the config file.\n")) sys.exit(-1) if config[KEY_RUN_PHYLO]: + misc.add_path_to_config(KEY_SUPPLEMENTARY_DATADIR,supplementary_datadir,config) + misc.check_path_exists(config[KEY_SUPPLEMENTARY_DATADIR]) + misc.add_arg_to_config(KEY_UPDATE_LOCAL_DATABASE,update_local_database,config) + if config[KEY_UPDATE_LOCAL_DATABASE] not in [True, False]: + 
sys.stderr.write(cyan(f"`{KEY_UPDATE_LOCAL_DATABASE}` argument must be either True/False if specified through the config file.\n")) + sys.exit(-1) + + if config[KEY_UPDATE_LOCAL_DATABASE] and not config[KEY_SUPPLEMENTARY_DATADIR]: + sys.stderr.write(cyan(f"Error: Cannot update local database with new sequences as no supplementary data directory has been provided.\n")) + sys.exit(-1) + + misc.add_arg_to_config(KEY_SUPPLEMENTARY_METADATA_ID_COLUMN,supplementary_metadata_id_column_arg,config) + + if config[KEY_SUPPLEMENTARY_DATADIR]: + db_dir = os.path.join(config[KEY_TEMPDIR],"local_db") + if not os.path.exists(db_dir): + os.mkdir(db_dir) + + config[KEY_SUPPLEMENTARY_SEQUENCES] = os.path.join(config[KEY_TEMPDIR],"local_db","supp_seqs.fasta") + config[KEY_SUPPLEMENTARY_METADATA] = os.path.join(config[KEY_TEMPDIR],"local_db","supp_metadata.csv") + + gather_supplementary_data(config[KEY_SUPPLEMENTARY_DATADIR],config[KEY_SUPPLEMENTARY_SEQUENCES],config[KEY_SUPPLEMENTARY_METADATA],config[KEY_SUPPLEMENTARY_METADATA_ID_COLUMN],config) + + misc.add_arg_to_config(KEY_SUPPLEMENTARY_METADATA_COLUMNS,supplementary_metadata_columns_arg,config) + if not type(config[KEY_SUPPLEMENTARY_METADATA_COLUMNS])==list: + config[KEY_SUPPLEMENTARY_METADATA_COLUMNS] = [config[KEY_SUPPLEMENTARY_METADATA_COLUMNS]] + + if config[KEY_SUPPLEMENTARY_METADATA]: + qc_supplementary_metadata_file(config[KEY_SUPPLEMENTARY_METADATA], + config) misc.add_arg_to_config(KEY_PHYLO_METADATA_COLUMNS,phylo_metadata_columns_arg,config) with open(barcodes_csv,"r") as f: @@ -193,38 +280,13 @@ def phylo_group_parsing, for col in config[KEY_PHYLO_METADATA_COLUMNS]: if col not in reader.fieldnames and col not in VALUE_PHYLO_METADATA_COLUMNS: missing.add(col) + if missing: sys.stderr.write(cyan(f"The following {KEY_PHYLO_METADATA_COLUMNS} columns are missing from the barcodes.csv file:\n")) for i in missing: sys.stderr.write(cyan(f"- {i}\n")) sys.exit(-1) - if supplementary_sequences_arg: - misc.add_file_to_config(KEY_SUPPLEMENTARY_SEQUENCES,supplementary_sequences_arg,config) - - seq_ids = set() - - if config[KEY_SUPPLEMENTARY_SEQUENCES]: - seq_ids = qc_supplementary_sequence_file(config[KEY_SUPPLEMENTARY_SEQUENCES]) - else: - print(cyan("Note: no supplementary sequence file provided.")) - - if config[KEY_UPDATE_LOCAL_DATABASE]: - sys.stderr.write(cyan(f"Error: Cannot update local database with new sequences as no supplementary sequences have been provided.\n")) - sys.exit(-1) - - misc.add_file_to_config(KEY_SUPPLEMENTARY_METADATA,supplementary_metadata_arg,config) - - misc.add_arg_to_config(KEY_SUPPLEMENTARY_METADATA_COLUMNS,supplementary_metadata_columns_arg,config) - if not type(config[KEY_SUPPLEMENTARY_METADATA_COLUMNS])==list: - config[KEY_SUPPLEMENTARY_METADATA_COLUMNS] = [config[KEY_SUPPLEMENTARY_METADATA_COLUMNS]] - misc.add_arg_to_config(KEY_SUPPLEMENTARY_METADATA_ID_COLUMN,supplementary_metadata_id_column_arg,config) - - if config[KEY_SUPPLEMENTARY_METADATA]: - qc_supplementary_metadata_file(config[KEY_SUPPLEMENTARY_METADATA], - seq_ids, - config) - def parse_read_dir(readdir,config): if readdir: diff --git a/piranha/test/supp_metadata.csv b/piranha/test/supp_data/supp_metadata.csv similarity index 100% rename from piranha/test/supp_metadata.csv rename to piranha/test/supp_data/supp_metadata.csv diff --git a/piranha/test/supp_seqs.fasta b/piranha/test/supp_data/supp_seqs.fasta similarity index 50% rename from piranha/test/supp_seqs.fasta rename to piranha/test/supp_data/supp_seqs.fasta index d70d764..be901ae 100644 --- 
a/piranha/test/supp_seqs.fasta +++ b/piranha/test/supp_data/supp_seqs.fasta @@ -4,9 +4,3 @@ GGAATTGGTGACATGATTGAGGGGGCTGTTGAAGGAATTACTAAAAATGCATTAGCTNCCCCGACTTCCACCAATAGCCT NGGATTGGTGACATGATTGAGGGGGCTGTTGAAGGAATTACTAAAAATGCATTAGCTNCCCCGACTTCCACCAATAGCCTGCCTAACACAGAGCCGAGCGGTCCAGCCCACTCCAAGGAGATACCTGCATTGACAGCCGTGGAGACAGGGGCCACCAATCCGTTGGTGCCTTCGGACACCGTGCAAACGCGCCATGTCATTCAGAGACGGACGCGATCAGAGTCCACGGTTGAGTCGTTCTTTGCAAGAGGGGCTTGCGTGGCTATCATTGAAGTGGACAATGATGCGCCAACGAAGCGCGCCAGCAGATTGTTTTCGGTTTGGAAAATAACTTACAAAGATACTGTTCAGCTGAGACGCAAACTGGAATTTTTCACATATTCGAGATTTGACATGGAGTTCACTTTTGTGGTCACCTCAAACTACACTGATGCAAATAACGGACATGCATTGAACCAGGTTTATCAGATAATGTATATACCACCCGGAGCACCTATCCCTGGCAAATGGAATGACTATACGTGGCAGACGTCCTCCAACCCGTCGGTGTTTTACACCTATGGGGCGCCCCCAGCAAGAATATCAGTGCCCTACGTGGGAATTGCTAATGCGTATTCCCACTTTTATGATGGGTTCGCAAAGGTACCACTAGCGGGTCAAACCTCAACTGAAGGCGATTCGTTGTACGGTGCTGCCTCATTGAATGATTTTGGATCACTGGCTGTTCGCGTGGTAAATGATCATAACCCCACGCGGCTCACCTCCAAGATCAGAGTGTACATGAAGCCAAAGCATGTCAGAGTCTGGTGCCCACGACCTCCACGAGTAGTTCCATACTTCGGACCAGGTGTTGATTATAAAGATGGGCTCACCCCACTACCAGAAAAGGGATTAACGACTTAT >1177|VDPV display_name=Sabin3-related GGTATTGAAGATTTGATCCCTGAAGTTGCACAGGGCGCCCTAACTTTGTCACTCCCGAAGCAACAGGATAGCTTACCTGATACTAAGGCCAGTGGCCCGGCGCATTCCAAGGAGGTACCTGCACTCACTGCAGTCGAGACTGGAGCCACCAATCCTCTGGTACCATCCGACACAGTTCAAACGCGCCACGTAGTCCAACGACGCAGCAGGTCAGAGTCCACAATAGAATCATTCTTCGCACGCGGGGCGTGCGTCGCTATTATTGAGGTGGACAATGAACAACCAACCACCCGGGCACAGAAACTATTTGCCATGTGGCGCATTACATACAAAGATACAGTGCAGTTGCGCCGTAAGTTGGAGTTTTTCACATACTCTCGTTTTGACATGGAATTCACCTTCGTGGTAACCGCCAACTTCACCAACACTAATAATGGGCATGCACTCAACCAGGTGTACCAGTTAATGTACATNCCCCCAGGGGCACCCACACCAACGTCATGGGATGACTACACTTGGCAAACATCTTCCAACCCGTCCATATTTTACACCTATGGGGCTGCCCCGGCGCGAATCTCAGTGCCATACGTGGGGTTAGCCAATGCTTACTCGCACTTTTACGACGGCTTCGCCAAGGTGCCATTGAAGACAGATGCCAATGACCAGATTGGTGATTCCTTGTACAGCGCCATGACAGTTGATGACTTTGGTGTATTGGCAGTTCCGTTGTCAATGATCACAACCCCACTAAAGTAACCTCCAAAGTCCGCGTTTACATGAAACCCAAACACGTACGTGTCTGGTGCCCTAGACCGCCGCGTGCGGTTCCTTATTATGGACCAGGGGTGGACTATAGGTACAACTTGGAACCCTTATCTGAGAAAGGTTTGACCACATAT ->1178|VDPV display_name=Sabin3-related -GGTATTGAAGATTTGATCCCTGAAGTTGCACAGGGCGCCCTAACTTTGTCACTCCCGAAGCAACAGGATAGCTTACGAGATACTAAGGCCAGTGGCCCGGGGCATTCCAAGGAGGTACCTGCACTCACTGCAGTCGAGACTGGAGCCACCAATCCTCTGGTACCATCCGACACAGTTCAAACGCGCCACGTAGTCCAACGACGCAGCAGGTCAGAGTCCACAATAGAATCATTCTTCGCACGCGGGGCGTGCGTCGCTATTATTGAGGTGGACAATGAACAACCAACCACCCGGGCACAGAAACTATTTGCCATGTGGCGCATTACATACAAAGATACAGTGCAGTTGCGCCGTAAGTTGGAGTTTTTCACATACTCTCGTTTTGACATGGAATTCACCTTCGTGGTAACCGCCAACTTCACCAACACTAATAATGGGCATGCACTCAACCAGGTGTACCAGTTAATGTACATNCCCCCAGGGGCACCCACACCAACGTCATGGGATGACTACACTTGGCAAACATCTTCCAACCCGTCCATATTTTACACCTATGGGGCTGCCCCGGCGCGAATCTCAGTGCCATACGTGGGGTTAGCCAATGCTTACTCGCACTTTTACGACGGCTTCGCCAAGGTGCCATTGAAGACAGATGCCAATGACCAGATTGGTGATTCCTTGTACAGCGCCATGACAGTTGATGACTTTGGTGTATTGGCAGTTCCGTTGTCAATGATCACAACCCCACTAAAGTAACCTCCAAAGTCCGCGTTTACATGAAACCCAAACACGTACGTGTCTGGTGCCCTAGACCGCCGCGTGCGGTTCCTTATTATGGACCAGGGGTGGACTATAGGTACAACTTGGAACCCTTATCTGAGAAAGGTTTGACCACATAT ->1179|VDPV display_name=Sabin3-related 
-GGTATTGAAGATTTGATCCCTGAAGTTGCACAGGGCGCCTTAACTTTGTCACTCCCGAAGCAACAGGATAGCTTACCTGATACTAAGGCCAGTGGCCCGGCGCATTCCAAGGAGGTACCTGCACTCACTGCAGTCGAGACTGGAGCCACCAATCCTCTGGTACCATCCGACACAGTTCAAACGCGCCACGTAGTCCAACGACGCAGCAGGTCAGAGTCCACAATAGAATCATTCTTCGCACGCGGGGCGTGCGTCGCTATTATTGAGGTGGACAATGAACAACCAACCACCCGGGCACAGAAACTATTTGCCATGTGGCGCATTACATACAAAGATACAGTGCAGTTGCGCCGTAAGTTGGAGTTTTTCACATACTCTCGTTTTGACATGGAATTCACCTTCGTGGTAACCGCCAACTTCACCAACACTAATAATGGGCATGCACTCAACCAGGTGTACCAGTTAATGTACATNCCCCCAGGGGCACCCACACCAACGTCATGGGATGACTACACTTGGCAAACATCTTCCAACCCGTCCATATTTTACACCTATGGGGCTGCCCCGGCGCGAATCTCAGTGCCATACGTGGGGTTAGCCAATGCTTACTCGCACTTTTACGACGGCTTCGCCAAGGTGCCATTGAAGACAGATGCCAATGACCAGATTGGTGATTCCTTGTACAGCGCCATGACAGTTGATGACTTTGGTGTATTGGCAGTTCCGTTGTCAATGATCACAACCCCACTAAAGTAACCTCCAAAGTCCGCGTTTACATGAAACCCAAACACGTACGTGTCTGGTGCCCTAGACCGCCGCGTGCGGTTCCTTATTATGGACCAGGGGTGGACTATAGGTACAACTTGGAACCCTTATCTGAGAAAGGTTTGACCACATAT ->1180|VDPV display_name=Sabin2-related -GGAATTGGTGACATGATTGAGGGGGCCGTTGAAGGGATTACTAAAAATGCATTGGCTCCCCCGACTTCCACCAATAGCCTGCCCGACACGAAGCCGAGCGGTCCAGCCCATTCCAAGGAGATACCTGCATTGACAGCCGTGGAGACAGGGGCTACCAATCCGTTGGTGCCTTCGGACACCGTGCAAACGCGCCATGTCATCCAGAGACGAACGCGATCAGAGTCCACGGTTGAGTCATTCTTTGCAAGAGGGGCTTGTGTGGCTATCATTGAGGTGGACAATGATGCACCGACAAAGCGCGCCAGCAGATTGTTTTCGGTTTGGAAAATAACTTACAAAGATACTGTTCAACTGAGACGCAAACTGGAATTTTTCACATATTCGAGATTTGACATGGAGTTCACTTTTGTGGTCACCTCAAACTACACTGATGCAAATAACGGACATGCATTGAACCAAGTTTATCAGATAATGTATATACCACCCGGAGCACCTATCCCTGGTAAATGGAATGACTACACGTGGCAGACGTCCTCTAACCCGTCGGTGTTTTACACCTATGGGGCGCCCCCAGCAAGAATATCAGTGCCCTACGTGGGAATTGCTAATGCGTATTCCCACTTCTATGATGGGTTTGCAAAAGTACCACTAGCGGATCAAGCCTCAACTGAAGGCGATTCGTTGTACGGTGCTGCCTCACTGAATGATTTTGGATCACTGGCTGTTCGCGTGGTAAATGACCACAACCCCACGCGGCTCACCTCCAAGATCAGAGTGTACATGAAGCCAAAGCATGTCAGAGTCTGGTGCCCACGACCTCCACGAGCAGTCCCATACTTCGGACCAGGTGTTGATTATAAAGATGGGCTCACCCCACTACCAGAAAAGGGATTAACGACTTAT diff --git a/piranha/test/supp_data/supp_seqs2.fasta b/piranha/test/supp_data/supp_seqs2.fasta new file mode 100644 index 0000000..5ff6b08 --- /dev/null +++ b/piranha/test/supp_data/supp_seqs2.fasta @@ -0,0 +1,6 @@ +>1178|VDPV display_name=Sabin3-related +GGTATTGAAGATTTGATCCCTGAAGTTGCACAGGGCGCCCTAACTTTGTCACTCCCGAAGCAACAGGATAGCTTACGAGATACTAAGGCCAGTGGCCCGGGGCATTCCAAGGAGGTACCTGCACTCACTGCAGTCGAGACTGGAGCCACCAATCCTCTGGTACCATCCGACACAGTTCAAACGCGCCACGTAGTCCAACGACGCAGCAGGTCAGAGTCCACAATAGAATCATTCTTCGCACGCGGGGCGTGCGTCGCTATTATTGAGGTGGACAATGAACAACCAACCACCCGGGCACAGAAACTATTTGCCATGTGGCGCATTACATACAAAGATACAGTGCAGTTGCGCCGTAAGTTGGAGTTTTTCACATACTCTCGTTTTGACATGGAATTCACCTTCGTGGTAACCGCCAACTTCACCAACACTAATAATGGGCATGCACTCAACCAGGTGTACCAGTTAATGTACATNCCCCCAGGGGCACCCACACCAACGTCATGGGATGACTACACTTGGCAAACATCTTCCAACCCGTCCATATTTTACACCTATGGGGCTGCCCCGGCGCGAATCTCAGTGCCATACGTGGGGTTAGCCAATGCTTACTCGCACTTTTACGACGGCTTCGCCAAGGTGCCATTGAAGACAGATGCCAATGACCAGATTGGTGATTCCTTGTACAGCGCCATGACAGTTGATGACTTTGGTGTATTGGCAGTTCCGTTGTCAATGATCACAACCCCACTAAAGTAACCTCCAAAGTCCGCGTTTACATGAAACCCAAACACGTACGTGTCTGGTGCCCTAGACCGCCGCGTGCGGTTCCTTATTATGGACCAGGGGTGGACTATAGGTACAACTTGGAACCCTTATCTGAGAAAGGTTTGACCACATAT +>1179|VDPV display_name=Sabin3-related 
+GGTATTGAAGATTTGATCCCTGAAGTTGCACAGGGCGCCTTAACTTTGTCACTCCCGAAGCAACAGGATAGCTTACCTGATACTAAGGCCAGTGGCCCGGCGCATTCCAAGGAGGTACCTGCACTCACTGCAGTCGAGACTGGAGCCACCAATCCTCTGGTACCATCCGACACAGTTCAAACGCGCCACGTAGTCCAACGACGCAGCAGGTCAGAGTCCACAATAGAATCATTCTTCGCACGCGGGGCGTGCGTCGCTATTATTGAGGTGGACAATGAACAACCAACCACCCGGGCACAGAAACTATTTGCCATGTGGCGCATTACATACAAAGATACAGTGCAGTTGCGCCGTAAGTTGGAGTTTTTCACATACTCTCGTTTTGACATGGAATTCACCTTCGTGGTAACCGCCAACTTCACCAACACTAATAATGGGCATGCACTCAACCAGGTGTACCAGTTAATGTACATNCCCCCAGGGGCACCCACACCAACGTCATGGGATGACTACACTTGGCAAACATCTTCCAACCCGTCCATATTTTACACCTATGGGGCTGCCCCGGCGCGAATCTCAGTGCCATACGTGGGGTTAGCCAATGCTTACTCGCACTTTTACGACGGCTTCGCCAAGGTGCCATTGAAGACAGATGCCAATGACCAGATTGGTGATTCCTTGTACAGCGCCATGACAGTTGATGACTTTGGTGTATTGGCAGTTCCGTTGTCAATGATCACAACCCCACTAAAGTAACCTCCAAAGTCCGCGTTTACATGAAACCCAAACACGTACGTGTCTGGTGCCCTAGACCGCCGCGTGCGGTTCCTTATTATGGACCAGGGGTGGACTATAGGTACAACTTGGAACCCTTATCTGAGAAAGGTTTGACCACATAT +>1180|VDPV display_name=Sabin2-related +GGAATTGGTGACATGATTGAGGGGGCCGTTGAAGGGATTACTAAAAATGCATTGGCTCCCCCGACTTCCACCAATAGCCTGCCCGACACGAAGCCGAGCGGTCCAGCCCATTCCAAGGAGATACCTGCATTGACAGCCGTGGAGACAGGGGCTACCAATCCGTTGGTGCCTTCGGACACCGTGCAAACGCGCCATGTCATCCAGAGACGAACGCGATCAGAGTCCACGGTTGAGTCATTCTTTGCAAGAGGGGCTTGTGTGGCTATCATTGAGGTGGACAATGATGCACCGACAAAGCGCGCCAGCAGATTGTTTTCGGTTTGGAAAATAACTTACAAAGATACTGTTCAACTGAGACGCAAACTGGAATTTTTCACATATTCGAGATTTGACATGGAGTTCACTTTTGTGGTCACCTCAAACTACACTGATGCAAATAACGGACATGCATTGAACCAAGTTTATCAGATAATGTATATACCACCCGGAGCACCTATCCCTGGTAAATGGAATGACTACACGTGGCAGACGTCCTCTAACCCGTCGGTGTTTTACACCTATGGGGCGCCCCCAGCAAGAATATCAGTGCCCTACGTGGGAATTGCTAATGCGTATTCCCACTTCTATGATGGGTTTGCAAAAGTACCACTAGCGGATCAAGCCTCAACTGAAGGCGATTCGTTGTACGGTGCTGCCTCACTGAATGATTTTGGATCACTGGCTGTTCGCGTGGTAAATGACCACAACCCCACGCGGCTCACCTCCAAGATCAGAGTGTACATGAAGCCAAAGCATGTCAGAGTCTGGTGCCCACGACCTCCACGAGCAGTCCCATACTTCGGACCAGGTGTTGATTATAAAGATGGGCTCACCCCACTACCAGAAAAGGGATTAACGACTTAT diff --git a/piranha/utils/config.py b/piranha/utils/config.py index 173c3f1..6203a4c 100644 --- a/piranha/utils/config.py +++ b/piranha/utils/config.py @@ -56,6 +56,8 @@ # PHYLO KEYS KEY_RUN_PHYLO = "run_phylo" +KEY_UPDATE_LOCAL_DATABASE = "update_local_database" +KEY_SUPPLEMENTARY_DATADIR = "supplementary_datadir" KEY_SUPPLEMENTARY_SEQUENCES = "supplementary_sequences" KEY_SUPPLEMENTARY_METADATA = "supplementary_metadata" KEY_SUPPLEMENTARY_METADATA_ID_COLUMN = "supplementary_metadata_id_column" @@ -63,7 +65,6 @@ KEY_OUTGROUP_SEQUENCES = "outgroup_sequences" KEY_ANNOTATIONS = "annotations" KEY_TREE_ANNOTATIONS = "tree_annotations" -KEY_UPDATE_LOCAL_DATABASE = "update_local_database" KEY_LOCATION = "location" KEY_SAMPLE_SEQS = "sample_seqs" @@ -74,6 +75,9 @@ VALUE_SUPPLEMENTARY_METADATA_COLUMNS = [KEY_LOCATION,"lineage"] VALUE_SUPPLEMENTARY_METADATA_ID_COLUMN = "sequence_name" +# HAPLO KEYS +KEY_HAPLOTYPE_SAMPLE_SIZE = "haplotype_sample_size" + # REPORT KEYS KEY_SNIPIT_SVG="snipit_svg" KEY_SNP_SITES = "snp_sites" @@ -168,7 +172,7 @@ # report defaults VALUE_ORIENTATION="vertical" VALID_ORIENTATION=["vertical","horizontal"] -VALUE_RUNNAME="Nanopore" +VALUE_RUNNAME="polioDDNS" VALUE_COLOUR_MAP=["#e68781","#476970","#fbedac"] VALUE_COLOUR_THEME="#e68781"