# test_example/config.yaml

# Config for analysis

# ----------------------------------------------------------------------------
# Relative paths from your top-level Snakefile to where you have the pipeline
# submodule cloned, and where you build the docs. Typically your top-level
# Snakefile will be in the root directory and paths will be `dms-vep-pipeline`
# and `./`, but in this example they are upstream from this subdirectory.
# ----------------------------------------------------------------------------
# Path to the pipeline submodule, relative to the top-level Snakefile.
pipeline_path: ../  # typically will be `dms-vep-pipeline` for real pipelines
# Path to where the docs are built, relative to the top-level Snakefile.
docs: ../docs  # typically will be `docs` for real pipelines

# ----------------------------------------------------------------------------
# Details on repo, used for docs. Change this to details for your project.
# ----------------------------------------------------------------------------
github_repo: dms-vep-pipeline  # name of the GitHub repository
github_user: dms-vep  # GitHub account hosting the repository
github_branch: main  # main branch for repo, assume renamed from master -> main
# Free-text project description, year, and authors; like the keys above,
# these are used for the docs.
description: Deep mutational scanning (DMS) pipeline for a viral entry protein (VEP)
year: 2023
authors: Jesse Bloom

# ----------------------------------------------------------------------------
# Parameters related to building the codon variants
# ----------------------------------------------------------------------------

# There are two ways you can get the codon variants: download a pre-built codon
# variant table, or build them from PacBio CCSs yourself. If you are using
# pre-built ones, then specify the links to URLs giving the codon variants,
# gene sequence, and protein sequence for the pre-built table under `prebuilt_variants`.
# Otherwise, specify the other variables. If `prebuilt_variants` is specified, all
# other variables in this section are ignored (and do not need to be specified at all).

# If using pre-built variants specify URL for pre-built codon-variant table
# and gene (codon) sequence.
# prebuilt_variants: https://raw.githubusercontent.com/dms-vep/dms-vep-pipeline/main/test_example/results/variants/codon_variants.csv
# prebuilt_geneseq: https://raw.githubusercontent.com/dms-vep/dms-vep-pipeline/main/test_example/results/gene_sequence/codon.fasta

# Parameters for building PacBio CCS consensuses
# NOTE: keep the explicit `1.0e-4` form. YAML 1.1 loaders such as PyYAML only
# read exponent notation as a float when it has a decimal point and a signed
# exponent; a value written as `1e-4` would be loaded as a string.
max_ccs_error_rate: 1.0e-4  # only keep CCS if gene/barcode error rate <= this
consensus_params:  # parameters for building PacBio consensus sequences
  # NOTE(review): these key names match the keyword arguments of
  # `alignparse.consensus.simple_mutconsensus` -- confirm that is where they go.
  max_sub_diffs: null  # presumably null means no limit -- TODO confirm
  max_indel_diffs: null  # presumably null means no limit -- TODO confirm
  max_minor_sub_frac: 0.2
  max_minor_indel_frac: 0.2
  min_support: 2

# PacBio sequencing
pacbio_runs: data/PacBio_runs.csv  # PacBio sequencing data
pacbio_amplicon: data/PacBio_amplicon.gb  # Genbank file with PacBio amplicon
pacbio_amplicon_specs: data/PacBio_feature_parse_specs.yaml  # alignparse feature parsing
variant_tags:  # variant tags in PacBio amplicon, or "null" if no tags
  # Each entry is a tag name that maps variant labels (including `wildtype`)
  # to a single nucleotide. Presumably the tag names must match features in
  # the amplicon / feature-parsing files above -- TODO confirm.
  variant_tag5:
    variant_1: G
    variant_2: C
    wildtype: A
  variant_tag3:
    variant_1: G
    variant_2: C
    wildtype: A

# ----------------------------------------------------------------------------
# Parameters related to computing functional scores
# ----------------------------------------------------------------------------

# If you want to use prebuilt functional scores just taken from another repo, then
# set the `prebuilt_muteffects` variable below to the URL for pre-built effects of
# mutations on the observed phenotype. In that case, all other variables specified in this
# section will be ignored and can be deleted.
# prebuilt_muteffects: https://raw.githubusercontent.com/dms-vep/dms-vep-pipeline/main/test_example/results/muteffects_functional/muteffects_observed.csv

# Parameters for functional scores and global epistasis analysis, only needed if not using
# `prebuilt_muteffects`.
func_scores_pseudocount: 0.5  # pseudocount when computing functional scores
func_scores_min_wt_count: 1000  # require this many wildtype counts or error
func_scores_min_wt_frac: 0.001  # require this fraction of all counts for wildtype or error
# Only fit global epistasis models for variants with at least this many
# pre-selection counts and this fraction of total pre-selection counts
func_scores_min_preselection_counts: 10  # maybe make larger for real expts, say 20 to 50
func_scores_min_preselection_frac: 0.00005  # make smaller for large libraries, say 0.1 / (library size)
# When fitting global epistasis models, put this floor on functional scores.
# "null" or not specifying the parameter means no floor.
func_scores_floor: null
# How to average mutation effects on viral entry (here the "median";
# presumably "mean" is the alternative, as for `escape_avg_method`).
muteffects_avg_method: median
# keyword arguments to `polyclonal.plot.lineplot_and_heatmap` for plotting functional effects
muteffects_plot_kwargs:
  addtl_slider_stats:
    # when plotting mut effects on viral entry, initially require mutation
    # seen in this many variants
    times_seen: 3
  heatmap_max_at_least: 1
  heatmap_min_at_least: -1
  init_floor_at_zero: false

# ----------------------------------------------------------------------------
# Analysis parameters
# ----------------------------------------------------------------------------

# Parameters for processing Illumina barcodes
# NOTE(review): these look like keyword arguments for
# `dms_variants.illuminabarcodeparser.IlluminaBarcodeParser` -- confirm.
illumina_barcode_parser_params:
  upstream: AACTCCACTAGGAACATTTCTCTCTCGAATCTAGA
  downstream: ''  # empty string, i.e. no downstream sequence is given
  minq: 20
  upstream_mismatch: 2

# Require samples to have an average of >= this many counts per variant.
# An error is raised for any sample with a lower average unless that sample
# is specified for `exclude_after_counts` in `barcode_runs`.
min_avg_counts: 6  # consider value more like ~20 for real pipelines

# Parameters for antibody escape-probability calculation.
# Require the neutralization standard to have at least this many counts
# and this fraction of total counts, or raise an error:
prob_escape_min_neut_standard_count: 1000
prob_escape_min_neut_standard_frac: 0.001
# Only compute escape probabilities for variants with at least this many
# counts and this fraction of total counts in the no-antibody sample **OR**
# the indicated counts or fraction in the antibody sample. For the antibody
# counts, if unspecified or set to null then we don't use that threshold.
prob_escape_min_no_antibody_counts: 5  # maybe make bigger for real experiments, say 20
prob_escape_min_no_antibody_frac: 0.00005  # make smaller for large libraries, say 0.1 / (library size)
prob_escape_min_antibody_counts: 10  # make bigger for real experiments, say twice `prob_escape_min_no_antibody_counts`
prob_escape_min_antibody_frac: 0.001  # make smaller for real experiments, say 2 / (library size)
# For uncensored prob escape values, clip values to be <= this.
prob_escape_uncensored_max: 5  # if not set, default to 5
# When averaging antibody escape values, take the "median" or "mean"?
escape_avg_method: median

# Do we add formatting to the antibody escape and functional effects plots?
format_antibody_escape_plots: true  # if option missing defaults to true
format_muteffects_plots: true  # if option missing defaults to true
# To override the default legend, provide a value for `antibody_escape_legend` /
# `muteffects_legend`; here both are null, i.e. no override is given.
antibody_escape_legend: null
muteffects_legend: null

# Do we show antibody escape icXX plots?
show_antibody_escape_icXX_plots: false

# ----------------------------------------------------------------------------
# Input data
# ----------------------------------------------------------------------------

# Map sequential 1, 2, ... numbering of the protein to the desired
# final reference numbering scheme. Required to have columns named
# "sequential_site" and "reference_site". If you just want to number in
# sequential numbering for everything, just make both columns sequential.
# Can optionally have a column called "region" that assigns each site to a
# region of the protein (eg, domain like RBD or NTD).
site_numbering_map: data/site_numbering_map.csv

# Classify mutations into different categories, such as which ones are
# designed to be in the library. Should have columns "sequential_site",
# "amino_acid", and "mutation_type".
mutation_design_classification: data/mutation_design_classification.csv

# Neutralization standard barcodes. Should have column "barcode", and if specifying
# "neutralization_standard_name" in `barcode_runs`, should also have column "name".
neut_standard_barcodes: data/neutralization_standard_barcodes.csv

# Illumina barcode sequencing. If there are no barcode runs, set to "null".
barcode_runs: data/barcode_runs.csv

# Configuration for polyclonal fitting
fit_polyclonal_threads: 1  # larger values make fitting faster if bootstrapping
polyclonal_config: data/polyclonal_config.yaml

# Spatial distances for polyclonal regularization. Set to "null" or just don't provide
# if you don't want spatial regularization. Should be data frame with columns
# "site_1", "site_2", and "distance", with sites in reference numbering scheme.
# Here we generate this file in `Snakefile` from a known PDB (which is why the
# path is under `results/` rather than `data/`): you can do that, or create it
# manually.
spatial_distances: results/spatial_distances/7tov.csv

# ----------------------------------------------------------------------------
# Names of output directories / files
# ----------------------------------------------------------------------------
# Directory with logs from running snakemake steps
logdir: results/logs

# Gene sequence extracted from PacBio amplicon. The example `prebuilt_geneseq`
# URL earlier in this file points at a published copy of the codon FASTA.
gene_sequence_codon: results/gene_sequence/codon.fasta
gene_sequence_protein: results/gene_sequence/protein.fasta

# Processing of PacBio CCSs to create codon-variant table. The example
# `prebuilt_variants` URL earlier in this file points at a published copy of
# the `codon_variants` CSV.
process_ccs_dir: results/process_ccs
aligned_ccs_file: results/process_ccs/CCSs_aligned_to_amplicon.csv
nt_variants: results/variants/nt_variants.csv
codon_variants: results/variants/codon_variants.csv

# Barcode sequencing (the three directories are organized by sample, per their names)
processed_barcode_runs: results/barcode_runs/processed_barcode_runs.csv
barcode_counts_dir: results/barcode_runs/counts_by_sample
barcode_counts_invalid_dir: results/barcode_runs/counts_invalid_by_sample
barcode_fates_dir: results/barcode_runs/fates_by_sample

# Variant counts, plus the plot and CSV of average counts per variant
variant_counts_dir: results/variant_counts
variant_avg_counts_plot: results/variant_counts/avg_counts_per_variant.html
variant_avg_counts_csv: results/variant_counts/avg_counts_per_variant.csv

# Escape probabilities for antibody selections
prob_escape_dir: results/prob_escape
antibody_selections: results/prob_escape/antibody_selections.csv

# Polyclonal fitting directory
polyclonal_dir: results/polyclonal_fits

# Antibody-escape values
escape_dir: results/antibody_escape

# Functional scores for functional selections
func_score_dir: results/func_scores
functional_selections: results/func_scores/functional_selections.csv

# Global epistasis fitting directory
globalepistasis_dir: results/globalepistasis_fits

# Mutation effects on function (viral entry) averaged over replicates. The
# example `prebuilt_muteffects` URL earlier in this file points at a published
# copy of the `muteffects_observed` CSV.
muteffects_observed: results/muteffects_functional/muteffects_observed.csv
muteffects_latent: results/muteffects_functional/muteffects_latent.csv

# HTML documentation
docs_source_dir: results/docs_source