-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
139 lines (117 loc) · 6.79 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Config for analysis
# ----------------------------------------------------------------------------
# Relative paths from the directory containing this configuration (and where
# you run the pipeline) to the subdirectory where the pipeline submodule is
# cloned, and to the directory where the docs are built. Typically the
# top-level Snakefile is in the root directory and these paths are
# `dms-vep-pipeline` and `docs`; here the submodule is `dms-vep-pipeline-3`.
# ----------------------------------------------------------------------------
# Subdirectory where the pipeline submodule is cloned
# (typically `dms-vep-pipeline` for real pipelines).
pipeline_path: 'dms-vep-pipeline-3'
# Directory where the docs are built (typically `docs` for real pipelines).
docs: 'docs'
# ----------------------------------------------------------------------------
# Build VitePress homepage
# ----------------------------------------------------------------------------
homepage: 'homepage/public'
build_vitepress_homepage: true
# ----------------------------------------------------------------------------
# Details on repo, used for docs. Change this to details for your project.
# ----------------------------------------------------------------------------
# URL of the main GitHub repo, e.g. https://github.com/<my_organization>/<my_repo>
github_repo_url: 'https://github.com/dms-vep/RABV_Pasteur_G_DMS'
# GitHub blob URL under which results files are stored, typically
# "{repo}/blob/{branch}".
github_blob_url: 'https://github.com/dms-vep/RABV_Pasteur_G_DMS/blob/main'
# Descriptions and metadata about the analysis.
description: 'Deep mutational scanning of rabies G (Pasteur strain)'
year: 2024
authors: '[Aditham et al](https://www.biorxiv.org/content/10.1101/2024.12.17.628970v1)'
# ----------------------------------------------------------------------------
# Set `use_precomputed_barcode_counts` to `true` to re-run this pipeline from
# the barcode counts already calculated from the FASTQs, rather than
# re-running the barcode counting.
# ----------------------------------------------------------------------------
use_precomputed_barcode_counts: false
# ----------------------------------------------------------------------------
# Site numbering, mutation classification, and neut standards
# ----------------------------------------------------------------------------
# Map sequential 1, 2, ... numbering of the protein to the desired
# final reference numbering scheme. Required to have columns named
# "sequential_site" and "reference_site". If you just want to number in
# sequential numbering for everything, just make both entries sequential.
# Should also have a column called "region" that assigns each site to a
# region of the protein (eg, domain like RBD or NTD).
site_numbering_map: data/site_numbering_map.csv
# Classify mutations into different categories, such as which ones are
# designed to be in the library. If you don't have different categories of
# designed mutations, just include all of the intended mutations with
# mutation type as "designed". The CSV specified below must have columns
# named "mutation_type", "amino_acid" or "mutant_aa", and either
# "reference_site" or "sequential_site" as specified by `site_col` key.
# `csv` and `site_col` are keys of this mapping, so they must be indented.
mutation_design_classification:
  csv: data/designed_mutations.csv  # CSV with data
  site_col: sequential_site  # site column, should be reference_site or sequential_site
# Neutralization standard barcodes. Should have columns "barcode" and "name"
# (giving name of this neutralization standard set). Can be empty CSV with
# those columns if no neutralization standards.
neut_standard_barcodes: data/neutralization_standard_barcodes.csv
# ----------------------------------------------------------------------------
# Parameters related to building barcode-variant lookup table
# ----------------------------------------------------------------------------
# There are two ways to get the codon variants: download a pre-built
# codon-variant table, or build them from PacBio CCSs yourself.
# If using pre-built variants, specify the pre-built codon-variant table and
# the gene (codon) sequence, each either as a URL (beginning with "http" or
# "ftp") or as a path to a file.
# If these next two variables are both "null" instead, then the variants are
# built from scratch using the parameters below.
prebuilt_variants: null
prebuilt_geneseq: null
# Parameters for building variants from PacBio sequencing, only needed if
# not using pre-built variants.
pacbio_runs: data/PacBio_runs.csv  # PacBio sequencing data
pacbio_amplicon: data/PacBio_amplicon.gb  # Genbank file with PacBio amplicon
pacbio_amplicon_specs: data/PacBio_feature_parse_specs.yaml  # alignparse feature parsing
# Each tag maps the identity name (variant_1, variant_2, wildtype) to the
# nucleotide observed at that tag. The per-tag keys repeat, so they must be
# nested under their own tag rather than sit at the top level.
variant_tags:  # variant tags in PacBio amplicon, or "null" if no tags
  variant_tag5:
    variant_1: G
    variant_2: C
    wildtype: A
  variant_tag3:
    variant_1: G
    variant_2: C
    wildtype: A
max_ccs_error_rate: 1.0e-4  # only keep CCS if gene/barcode error rate <= this
consensus_params:  # parameters for building PacBio consensus sequences
  max_sub_diffs: null
  max_indel_diffs: null
  max_minor_sub_frac: 0.2
  max_minor_indel_frac: 0.2
  # max_minor_greater_or_equal: True
  min_support: 3
# Created files holding the sequences of the parental protein.
gene_sequence_codon: 'data/gene_sequence/codon.fasta'
gene_sequence_protein: 'data/gene_sequence/protein.fasta'
# Created file holding the barcode-variant lookup table.
codon_variants: 'results/variants/codon_variants.csv'
# ----------------------------------------------------------------------------
# Parameters related to counting the variants from barcode sequencing
# ----------------------------------------------------------------------------
barcode_runs: data/barcode_runs.csv  # Illumina barcode runs, set to null if no runs
duplicate_fastq_R1: warn
# keyword parameters for `dms_variants.illuminabarcodeparser.IlluminaBarcodeParser`
# https://jbloomlab.github.io/dms_variants/dms_variants.illuminabarcodeparser.html#dms_variants.illuminabarcodeparser.IlluminaBarcodeParser
# The keyword parameters are keys of this mapping, so they must be indented.
illumina_barcode_parser_params:
  upstream: ACTCCACTAGGAACATTTCTCTCTCGAATCTAGA
  downstream: ''
  minq: 20
  upstream_mismatch: 2
# ----------------------------------------------------------------------------
# Configuration related to other analyses
# ----------------------------------------------------------------------------
# Each variable below gives the path to the configuration for one type of
# analysis. Set it to "null", or just don't provide it, if you aren't doing
# that type of analysis.
# Functional effects of mutations
func_effects_config: 'data/func_effects_config.yml'
# Antibody/serum escape
antibody_escape_config: 'data/antibody_escape_config.yml'
# Summaries across assays
summaries_config: 'data/summaries_config.yml'
# Custom rules configuration for `dms-viz`
dms_viz_config: 'data/dms_viz_config.yml'