forked from Arcadia-Science/ProteinCartography
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yml
121 lines (99 loc) · 5.27 KB
/
config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# ------------------------------------------------------------------------------------------------
# ProteinCartography Base Configuration
# ------------------------------------------------------------------------------------------------
# Config parameters in this file are used as defaults by the pipeline.
# To override these defaults, create a new config file containing only the overridden parameters
# and pass your new file to Snakemake using the --configfile flag.
# (the parameters in the new config file will overwrite the defaults listed here)
# ------------------------------------------------------------------------------------------------
# Basic settings
# ------------------------------------------------------------------------------------------------
# Pipeline run mode: either "search" or "cluster".
mode: 'search'

# Input directory.
input_dir: 'input/'

# Output directory (usually placed inside the input directory).
output_dir: 'input/output/'

# Name of the analysis; the final output files are named with this as a prefix.
analysis_name: 'example'
# ------------------------------------------------------------------------------------------------
# Search-mode settings
# These settings are required in 'search' mode and ignored in 'cluster' mode
# ------------------------------------------------------------------------------------------------
# URL of the Foldseek server to use.
# (Leave this unchanged unless you are running your own custom server.)
foldseek_server_url: 'https://search.foldseek.com'

# Names of the Foldseek databases to query.
# To avoid overloading the Foldseek server, we recommend limiting your search
# to only the databases relevant to your query.
foldseek_databases:
  - 'afdb50'
  - 'afdb-swissprot'
  - 'afdb-proteome'

# Upper limit on the number of Foldseek hits to retain.
# (The Foldseek API does not provide a way to limit the number of hits returned,
# so the list of retrieved hits is truncated to this length instead.)
max_foldseek_hits: 3000
# Upper limit on the number of BLAST hits to retrieve.
max_blast_hits: 3000

# BLAST word size: the length of the exact matches that start the candidate alignments.
blast_word_size: 5

# Word size to fall back to when the initial attempt with blast_word_size has failed.
blast_word_size_backoff: 6

# Maximal e-value to use for the BLAST search.
blast_evalue: 1.0

# Number of times to try calling BLAST before exiting with an error.
blast_num_attempts: 3
# Additional metadata fields to download from UniProt, if any.
uniprot_additional_fields: []

# Upper limit on the total number of structures to download (BLAST + Foldseek hits combined).
# (This is the final number of structures that will be used for analysis.)
max_structures: 5000

# Minimum and maximum protein lengths used to filter the Foldseek and BLAST hits
# before structures are downloaded from AlphaFold.
# A value of 0 removes that bound from the filtering;
# when both values are nonzero, min_length must be less than max_length.
min_length: 0
max_length: 0
# ------------------------------------------------------------------------------------------------
# Cluster-mode settings
# These settings are required in 'cluster' mode and ignored in 'search' mode
# ------------------------------------------------------------------------------------------------
# Path to a TSV file of protein metadata (usually retrieved from UniProt),
# used for visualization.
# Its columns are the same as those of the features_override_file (see below).
# Note: this path should be given relative to the input_dir specified above.
features_file: 'uniprot_features.tsv'

# Optional list of key proteins to highlight in the analysis plots,
# similar to how the input proteins are highlighted in 'search' mode.
# (To use this parameter, uncomment the lines below and add one or more protids.)
# key_protids:
#   - 'P60709'
# ------------------------------------------------------------------------------------------------
# Features-related settings
# ------------------------------------------------------------------------------------------------
# Path to a TSV file of protein metadata, similar to what would be pulled from UniProt.
# It is used to provide metadata for user-provided protein structures.
# See README.md for an explanation of the expected columns.
# (This parameter is ignored if the file doesn't exist.)
features_override_file: 'features_override.tsv'
# ------------------------------------------------------------------------------------------------
# Plotting-related settings
# ------------------------------------------------------------------------------------------------
# Plotting modes to use.
# Allowed values: "pca", "tsne", "umap", "pca_tsne", "pca_umap".
plotting_modes:
  - 'pca_tsne'
  - 'pca_umap'

# Broad taxonomic groups that will be categorized in plots.
# 'bac' is also accepted; it displays some large bacterial taxonomic groups.
# (See README.md for more details.)
taxon_focus: 'euk'
# ------------------------------------------------------------------------------------------------
# Resource execution settings
# ------------------------------------------------------------------------------------------------
resources:
  # YAML does not evaluate arithmetic: the former value `16 * 1000` was loaded
  # as the literal string "16 * 1000", not a number. The product is therefore
  # written out as a plain integer.
  # mem_mb is indented so that it is nested under `resources`; a bare
  # `resources:` key with nothing indented beneath it loads as null.
  mem_mb: 16000
# NOTE(review): the keys below match the names of Snakemake command-line options
# (--cores, --max-jobs-per-second, --max-status-checks-per-second, --local-cores);
# presumably they are consumed as execution settings - confirm against the caller.
cores: 8
max-jobs-per-second: 1
max-status-checks-per-second: 10
local-cores: 10