-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
221 lines (186 loc) · 10.6 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# Config for analysis
# ----------------------------------------------------------------------------
# Relative paths from your top-level Snakefile to where you have the pipeline
# submodule cloned, and where you build the docs. Typically your top-level
# Snakefile will be in the root directory and paths will be `dms-vep-pipeline`
# and `./`, but in this example they are upstream from this subdirectory.
# ----------------------------------------------------------------------------
pipeline_path: ../ # typically will be `dms-vep-pipeline` for real pipelines
docs: ../docs # typically will be `docs` for real pipelines
# ----------------------------------------------------------------------------
# Details on repo, used for docs. Change this to details for your project.
# ----------------------------------------------------------------------------
github_repo: dms-vep-pipeline
github_user: dms-vep
github_branch: main # main branch for repo, assume renamed from master -> main
description: Deep mutational scanning (DMS) pipeline for a viral entry protein (VEP)
year: 2023
authors: Jesse Bloom
# ----------------------------------------------------------------------------
# Parameters related to building the codon variants
# ----------------------------------------------------------------------------
# There are two ways you can get the codon variants: download a pre-built codon
# variant table, or build them from PacBio CCSs yourself. If you are using
# pre-built ones, then specify the links to URLs giving the codon variants,
# gene sequence, and protein sequence for the pre-built table under `prebuilt_variants`.
# Otherwise, specify the other variables. If `prebuilt_variants` is specified, all
# other variables in this section are ignored (and do not need to be specified at all).
# If using pre-built variants specify URL for pre-built codon-variant table
# and gene (codon) sequence.
# prebuilt_variants: https://raw.githubusercontent.com/dms-vep/dms-vep-pipeline/main/test_example/results/variants/codon_variants.csv
# prebuilt_geneseq: https://raw.githubusercontent.com/dms-vep/dms-vep-pipeline/main/test_example/results/gene_sequence/codon.fasta
# Parameters for building PacBio CCS consensuses
max_ccs_error_rate: 1.0e-4 # only keep CCS if gene/barcode error rate <= this
# Parameters for building PacBio consensus sequences. The five settings must be
# indented under `consensus_params`; written at column 0 they parse as separate
# top-level keys and `consensus_params` itself becomes null.
# NOTE(review): key names match the keyword arguments of alignparse's
# `simple_mutconsensus` -- confirm that is where the pipeline passes them.
consensus_params:
  max_sub_diffs: null
  max_indel_diffs: null
  max_minor_sub_frac: 0.2
  max_minor_indel_frac: 0.2
  min_support: 2
# PacBio sequencing
pacbio_runs: data/PacBio_runs.csv # PacBio sequencing data
pacbio_amplicon: data/PacBio_amplicon.gb # Genbank file with PacBio amplicon
pacbio_amplicon_specs: data/PacBio_feature_parse_specs.yaml # alignparse feature parsing
# Variant tags in the PacBio amplicon, or "null" if there are no tags.
# Each tag (`variant_tag5`, `variant_tag3`) maps a label to the nucleotide
# identity at that tag. The nesting is required: without it `variant_1`,
# `variant_2` and `wildtype` are duplicated top-level keys and the tag
# mappings themselves are null.
variant_tags:
  variant_tag5:
    variant_1: G
    variant_2: C
    wildtype: A
  variant_tag3:
    variant_1: G
    variant_2: C
    wildtype: A
# ----------------------------------------------------------------------------
# Parameters related to computing functional scores
# ----------------------------------------------------------------------------
# If you want to use prebuilt functional scores just taken from another repo, then
# set the `prebuilt_muteffects` variable below to the URL for pre-built effects of
# mutations on the observed phenotype. In that case, all other variables specified in this
# section will be ignored and can be deleted.
# prebuilt_muteffects: https://raw.githubusercontent.com/dms-vep/dms-vep-pipeline/main/test_example/results/muteffects_functional/muteffects_observed.csv
# Parameters for functional scores and global epistasis analysis, only needed if not using
# `prebuilt_muteffects`.
func_scores_pseudocount: 0.5 # pseudocount when computing functional scores
func_scores_min_wt_count: 1000 # require this many wildtype counts or error
func_scores_min_wt_frac: 0.001 # require this fraction of all counts for wildtype or error
# Only fit global epistasis models for variants with at least this many
# pre-selection counts and this fraction of total pre-selection counts
func_scores_min_preselection_counts: 10 # maybe make larger for real expts, say 20 to 50
func_scores_min_preselection_frac: 0.00005 # make smaller for large libraries, say 0.1 / (library size)
# when fitting global epistasis models, put this floor on functional scores. "null" or not specifying
# parameter means no floor.
func_scores_floor: null
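# when averaging mutation effects on viral entry, take the "median" or "mean"?
# (the initial "seen in this many variants" requirement for plots appears to be
# `times_seen` under `muteffects_plot_kwargs` -- confirm)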
muteffects_avg_method: median
# keyword arguments to `polyclonal.plot.lineplot_and_heatmap` for plotting functional effects
# Mapping of keyword arguments; every entry must be indented under
# `muteffects_plot_kwargs`, otherwise the key is null and the entries become
# unrelated top-level settings.
muteffects_plot_kwargs:
  # additional slider statistics; `times_seen` belongs to this sub-mapping
  addtl_slider_stats:
    times_seen: 3
  heatmap_max_at_least: 1
  heatmap_min_at_least: -1
  # canonical lowercase boolean (same value as `False`, portable across parsers)
  init_floor_at_zero: false
# ----------------------------------------------------------------------------
# Analysis parameters
# ----------------------------------------------------------------------------
# Parameters for processing Illumina barcodes
# Settings for the Illumina barcode parser; they must be nested under
# `illumina_barcode_parser_params` (at column 0 the key is null and the
# settings become stray top-level keys).
# NOTE(review): key names match the arguments of `dms_variants`'
# `IlluminaBarcodeParser` -- confirm against the pipeline code.
illumina_barcode_parser_params:
  upstream: AACTCCACTAGGAACATTTCTCTCTCGAATCTAGA
  downstream: ''  # explicitly the empty string, not null
  minq: 20
  upstream_mismatch: 2
# Require samples to have an average of >= this many counts per variant.
# Error raised for any sample with < this many counts unless it is specified
# for `exclude_after_counts` in `barcode_runs`
min_avg_counts: 6 # consider value more like ~20 for real pipelines
# Parameters for antibody escape-probability calculation.
# Require neut standard to have at least this many counts
# and this much fraction of total counts or raise error:
prob_escape_min_neut_standard_count: 1000
prob_escape_min_neut_standard_frac: 0.001
# Only compute escape probabilities for variants with at least this many
# counts and this fraction of total counts in the no-antibody sample **OR**
# the indicated counts or fraction in the antibody sample. For the antibody
# counts, if unspecified or set to null then we don't use that threshold.
prob_escape_min_no_antibody_counts: 5 # maybe make bigger for real expts, say 20
prob_escape_min_no_antibody_frac: 0.00005 # make smaller for large libraries, say 0.1 / (library size)
prob_escape_min_antibody_counts: 10 # make bigger for real expts, say twice `prob_escape_min_no_antibody_counts`
prob_escape_min_antibody_frac: 0.001 # make smaller for real experiments, say 2 / (library size)
# for uncensored prob escape values, clip values to be <= this
prob_escape_uncensored_max: 5 # if not set, default to 5
# when averaging antibody escape values, take the "median" or "mean"?
escape_avg_method: median
# do we add formatting to the antibody escape and functional effects plots
format_antibody_escape_plots: true # if option missing defaults to true
format_muteffects_plots: true # if option missing defaults to true
# to override default, provide a value for antibody_escape_legend / muteffects_legend
antibody_escape_legend: null
muteffects_legend: null
# do we show antibody escape icXX plots?
show_antibody_escape_icXX_plots: false
# ----------------------------------------------------------------------------
# Input data
# ----------------------------------------------------------------------------
# Map sequential (1, 2, ...) numbering of the protein to the desired
# final reference numbering scheme. Required to have columns named
# "sequential_site" and "reference_site". If you just want to number in
# sequential numbering for everything, just make both entries sequential.
# Can optionally have a column called "region" that assigns each site to a
# region of the protein (eg, domain like RBD or NTD).
site_numbering_map: data/site_numbering_map.csv
# Classify mutations into different categories, such as which ones are
# designed to be in the library. Should have columns "sequential_site",
# "amino_acid", and "mutation_type"
mutation_design_classification: data/mutation_design_classification.csv
# Neutralization standard barcodes. Should have column "barcode", and if specifying
# "neutralization_standard_name" in `barcode_runs`, should also have column "name"
neut_standard_barcodes: data/neutralization_standard_barcodes.csv
# Illumina barcode sequencing. If no barcode runs, set to "null"
barcode_runs: data/barcode_runs.csv
# configuration for polyclonal fitting
fit_polyclonal_threads: 1 # larger values makes faster if bootstrapping
polyclonal_config: data/polyclonal_config.yaml
# Spatial distances for polyclonal regularization. Set to "null" or just don't provide
# if you don't want spatial regularization. Should be data frame with columns
# "site_1", "site_2", and "distance", with sites in reference numbering scheme.
# Here we generate this file in `Snakefile` from a known PDB: you can do that,
# or create it manually.
spatial_distances: results/spatial_distances/7tov.csv
# ----------------------------------------------------------------------------
# Names of output directories / files
# ----------------------------------------------------------------------------
# directory with logs from running snakemake steps
logdir: results/logs
# gene sequence extracted from PacBio amplicon
gene_sequence_codon: results/gene_sequence/codon.fasta
gene_sequence_protein: results/gene_sequence/protein.fasta
# processing of PacBio CCSs to create codon-variant table
process_ccs_dir: results/process_ccs
aligned_ccs_file: results/process_ccs/CCSs_aligned_to_amplicon.csv
nt_variants: results/variants/nt_variants.csv
codon_variants: results/variants/codon_variants.csv
# barcode sequencing
processed_barcode_runs: results/barcode_runs/processed_barcode_runs.csv
barcode_counts_dir: results/barcode_runs/counts_by_sample
barcode_counts_invalid_dir: results/barcode_runs/counts_invalid_by_sample
barcode_fates_dir: results/barcode_runs/fates_by_sample
# variant counts
variant_counts_dir: results/variant_counts
variant_avg_counts_plot: results/variant_counts/avg_counts_per_variant.html
variant_avg_counts_csv: results/variant_counts/avg_counts_per_variant.csv
# escape probabilities for antibody selections
prob_escape_dir: results/prob_escape
antibody_selections: results/prob_escape/antibody_selections.csv
# polyclonal fitting directory
polyclonal_dir: results/polyclonal_fits
# antibody-escape values
escape_dir: results/antibody_escape
# functional scores for functional selections
func_score_dir: results/func_scores
functional_selections: results/func_scores/functional_selections.csv
# global epistasis fitting directory
globalepistasis_dir: results/globalepistasis_fits
# mutation effects on function (viral entry) averaged over replicates
muteffects_observed: results/muteffects_functional/muteffects_observed.csv
muteffects_latent: results/muteffects_functional/muteffects_latent.csv
# html documentation
docs_source_dir: results/docs_source