# data/func_effects_config.yml

# Configurations for determining functional effects of mutations

# --------------------------------------------------------------------------------------
# Configuration for computing functional scores and fitting global epistasis models
# to each individual selection experiment
# --------------------------------------------------------------------------------------

# Define default parameters for computing functional scores.
# Can be provided to individual selections using the alias operator (*)
func_scores_params_default: &func_scores_params_default
  # Pseudocount for the functional score calculation (presumably added to the
  # variant counts; confirm against the pipeline code).
  pseudocount: 0.5
  # `min_wt_count` and `min_wt_frac` are designed to raise errors if a selection
  # experiment has excessively low wildtype counts (which will lead to inaccurate
  # estimates).
  min_wt_count: 10000  # error if not at least this many wildtype counts
  min_wt_frac: 0.001  # error if not at least this fraction of counts for wildtype
  # `min_pre_selection_count` and `min_pre_selection_frac` drop variants with low counts
  # or frequencies that may not have sufficient statistics for good estimates.
  min_pre_selection_count: 25  # drop variants < this many pre-selection counts
  # !!! Tune min_pre_selection_frac to the library size, such as 0.1 / (library size).
  # Note that this formula gives a SMALLER value for larger libraries. !!!
  min_pre_selection_frac: 0.000002  # drop variants that are < this fraction of all counts

# Define default parameters for fitting global epistasis models to each
# individual selection to get mutation effects.
global_epistasis_params: &global_epistasis_params
  # How to clip functional scores at upper / lower bounds. Allowable values:
  #  - median_stop: median func score of all variants with stop codons
  #  - null: no clipping
  #  - a number: clip at this number
  clip_lower: median_stop  # lower bound: median score of stop-codon variants
  clip_upper: null  # upper bound: no clipping
  # Do we collapse barcodes with same variant? `false` disables collapsing;
  # `mean` / `median` presumably select how collapsed barcodes are combined
  # (confirm against the pipeline code).
  collapse_identical_variants: false  # {false, mean, median}

# Define how to compute functional scores. Each key is a selection experiment.
# A global epistasis model is used to get mutational effects for each selection.
# Recommended naming of selection experiments is:
#   <library>-<post-selection sample date as YYMMDD>-<description>-<replicate>
# Each selection experiment should in turn provide the following keys:
#  - `post_selection_sample`: sample after selection (eg, infection of cells)
#  - `pre_selection_sample`: sample pre selection (eg, initial variant frequencies)
#  - `func_score_params`: parameters for computing functional scores
#  - `global_epistasis_params`: parameters for fitting global epistasis model
func_scores:
  # Every selection below reuses the shared default parameter sets via aliases.
  # Within a library, the high- and medium-ACE2 selections share the same
  # pre-selection sample (the VSVG control on 293T).

  # --- high ACE2 selections, one per library ---
  Lib1-230614_high_ACE2:
    pre_selection_sample: Lib1-230614-VSVG_control_293T
    post_selection_sample: Lib1-230614-no-antibody_control_highACE2
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params
  Lib2-230614_high_ACE2:
    pre_selection_sample: Lib2-230614-VSVG_control_293T
    post_selection_sample: Lib2-230614-no-antibody_control_highACE2
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params
  # --- medium ACE2 selections, one per library ---
  Lib1-230614_medium_ACE2:
    pre_selection_sample: Lib1-230614-VSVG_control_293T
    post_selection_sample: Lib1-230614-no-antibody_control_mediumACE2
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params
  Lib2-230614_medium_ACE2:
    pre_selection_sample: Lib2-230614-VSVG_control_293T
    post_selection_sample: Lib2-230614-no-antibody_control_mediumACE2
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params

# --------------------------------------------------------------------------------------
# Configuration for averaging mutation functional effects across selections and plotting
# them.
# --------------------------------------------------------------------------------------

# Average/plot mutation functional effects from different selections with `avg_func_effects`.
# Each key is a condition which has the following values:
#  - `selections`: list of selections for which we average mutation functional effects
#  - `avg_method`: how to average across the selections, options are "median" and "mean"
#  - `floor_for_effect_std`: floor values at this before computing standard deviation.
#  - `per_selection_tooltips`: whether to show per-selection effects via tooltips
#  - `plot_kwargs`: keyword arguments passed to `polyclonal.plot.lineplot_and_heatmap`
#  - `title`: title of plot. Will be suffixed with "(latent phenotype)" or "(functional score)"
#  - `legend`: legend added to plot

# Define some defaults for each condition, used via the merge (<<) operator.
avg_func_effects_default: &avg_func_effects_default
  # Shared settings for every `avg_func_effects` condition; conditions pull these
  # in with the merge (<<) operator and add their own `title` and `selections`.
  avg_method: median
  # Floor effects at this value before computing the standard deviation.
  floor_for_effect_std: -2.5
  per_selection_tooltips: true
  # Keyword arguments passed to `polyclonal.plot.lineplot_and_heatmap`.
  plot_kwargs:
    addtl_slider_stats:
      times_seen: 3
      effect_std: 1.6
    # NOTE(review): presumably makes the `effect_std` slider act as a maximum
    # rather than a minimum; confirm against the plotting function.
    addtl_slider_stats_as_max: [effect_std]
    heatmap_max_at_least: 1
    heatmap_min_at_least: -1
    heatmap_min_fixed: -2.5
    init_floor_at_zero: false
    init_site_statistic: mean
    site_zoom_bar_color_col: region  # supplied in the `site_numbering_map`
    slider_binding_range_kwargs:
      times_seen:
        step: 1
        min: 1
        max: 25
      n_selections:
        step: 1
    # Written as an explicit null: a bare empty value also parses as null but
    # reads as if a value was forgotten.
    sites_to_show: null
  legend: |
    Interactive plot of the effects of mutations. Negative values indicate deleterious mutations,
    positive values indicate beneficial mutations for the measured phenotype.

    Use the site zoom bar at the top to zoom in on specific sites. The line plot shows a summary
    statistic indicating the effects of mutations at each site. The heat map shows the effects of
    individual mutations, with parental amino-acid identities indicated by x and gray
    indicating non-measured mutations.

    You can mouse over points to get details about individual measurements, including measurements
    in individual selection experiments.

    The options at the bottom of the plot let you modify the display, such as by selecting how
    many different variants a mutation must be seen in to be shown (*minimum times_seen*),
    how many different experimental selections the mutation was measured in
    (*minimum n_selections*), what site summary statistic to show, etc.

    The *minimum max of effect* at site is useful to select the sites where mutations have
    the most positive functional effects.

# Define the functional effect conditions to average
avg_func_effects:
  # Each condition merges in the shared defaults and then sets its own `title`
  # and `selections`; keys written explicitly here override merged-in keys.

  # Average of the Lib1 and Lib2 high-ACE2 selections.
  293T_high_ACE2_entry:
    <<: *avg_func_effects_default
    title: Mutation effects on entry into 293T cells expressing high ACE2
    selections:
      - Lib1-230614_high_ACE2
      - Lib2-230614_high_ACE2
  # Average of the Lib1 and Lib2 medium-ACE2 selections.
  293T_medium_ACE2_entry:
    <<: *avg_func_effects_default
    title: Mutation effects on entry into 293T cells expressing medium ACE2
    selections:
      - Lib1-230614_medium_ACE2
      - Lib2-230614_medium_ACE2

# --------------------------------------------------------------------------------------
# Configuration for func effect shifts from `multidms` models comparing conditions.
# --------------------------------------------------------------------------------------

# Define `func_effect_shifts` comparisons of different conditions. Each key is a comparison.
# Each comparison should have the following keys:
#  - `conditions` : dict keyed by condition names with values func scores from above.
#  - `reference` : name of the reference condition, must be in `conditions`
#  - `clip_lower` : how to clip functional scores at lower bound
#  - `clip_upper` : how to clip functional scores at upper bound
#  - `collapse_identical_variants` : do we collapse identical barcodes?
#  - `latent_offset` : is there a condition specific offset in latent effects
#  - `lasso_shifts` : list of strength of lasso regularization on shifts
# If you are not doing comparisons, just set `func_effect_shifts` to `null` or
# leave it out altogether.

# default settings for `func_effect_shifts` comparisons
func_effect_shifts_default: &func_effect_shifts_default
  # Shared settings merged into each `func_effect_shifts` comparison via `<<`.
  # How to clip functional scores at upper / lower bounds. Allowable values:
  #  - median_stop: median func score of all variants with stop codons
  #  - null: no clipping
  #  - a number: clip at this number
  clip_lower: median_stop
  clip_upper: null
  # Do we collapse barcodes with same variant?
  collapse_identical_variants: false  # {false, mean, median}
  # Do we have offset in latent effects for different conditions? `alpha_d` in `multidms`
  latent_offset: true
  # Strength of lasso regularization on shifts in `multidms`. You can try a range of
  # values and then pick a best one at the averaging step.
  lasso_shifts: [0.00001, 0.00005, 0.0001, 0.0002, 0.001]

func_effect_shifts:
  # One comparison per library: medium vs high ACE2, with high ACE2 as the
  # reference condition. Each comparison merges in the shared defaults.
  Lib1-230614_ACE2_expression:
    <<: *func_effect_shifts_default
    reference: high_ACE2
    conditions:
      high_ACE2: Lib1-230614_high_ACE2
      medium_ACE2: Lib1-230614_medium_ACE2
  Lib2-230614_ACE2_expression:
    <<: *func_effect_shifts_default
    reference: high_ACE2
    conditions:
      high_ACE2: Lib2-230614_high_ACE2
      medium_ACE2: Lib2-230614_medium_ACE2

# --------------------------------------------------------------------------------------
# Configuration for averaging func effect shifts from multiple comparisons.
# --------------------------------------------------------------------------------------

# Define `func_effect_shifts` comparisons to average. `avg_func_effect_shifts` is keyed
# by name of averaged comparisons. Keys within that are:
#  - `title`: title of comparisons being averaged
#  - `comparisons`: list of comparisons from `func_effect_shifts`, must have same
#                   condition names, lasso shifts, and reference.
#  - `lasso_shift`: the single lasso shift to use for the final averaged values.
#  - `avg_method`: how to average across the comparisons, options are "median" and "mean"
#  - `plot_kwargs`: keyword arguments passed to `polyclonal.plot.lineplot_and_heatmap`
#  - `title`: title of plot. Will be suffixed with "(latent phenotype)" or "(functional score)"
#  - `legend`: legend added to plot

# Define some defaults for each condition, used via the merge (<<) operator.
avg_func_effect_shifts_default: &avg_func_effect_shifts_default
  # Shared settings merged into each `avg_func_effect_shifts` entry via `<<`.
  avg_method: median
  # NOTE(review): presumably toggles showing per-comparison shifts in tooltips,
  # by analogy with `per_selection_tooltips`; confirm against the pipeline.
  per_comparison_tooltips: true
  # Keyword arguments passed to `polyclonal.plot.lineplot_and_heatmap`.
  plot_kwargs:
    addtl_slider_stats:
      times_seen: 3
    heatmap_max_at_least: 0.5
    heatmap_min_at_least: -0.5
    init_floor_at_zero: false
    init_site_statistic: mean
    site_zoom_bar_color_col: region  # supplied in the `site_numbering_map`
    slider_binding_range_kwargs:
      times_seen:
        step: 1
        min: 1
        max: 25
      n_comparisons:
        step: 1
  legend: |
    Interactive plot of shifts in effects of mutations. Negative values indicate mutations that
    have shifted to become more deleterious; positive values indicate mutations shifted to
    be more beneficial.

    Use the site zoom bar at the top to zoom in on specific sites. The line plot shows a summary
    statistic indicating the effects of mutations at each site. The heat map shows the effects of
    individual mutations, with parental amino-acid identities indicated by x and gray
    indicating non-measured mutations.

    You can mouse over points to get details about individual measurements, including measurements
    in individual comparisons.

    The options at the bottom of the plot let you modify the display, such as by selecting how
    many different variants a mutation must be seen in to be shown (*minimum times_seen*),
    how many different experimental comparisons the mutation was measured in
    (*minimum n_comparisons*), what site summary statistic to show, etc.

# Define the functional effect shift comparisons to average
avg_func_effect_shifts:
  # Averages the per-library ACE2-expression comparisons (Lib1 and Lib2).
  target_cell_ACE2_expression_comparison:
    <<: *avg_func_effect_shifts_default
    title: Comparison of functional effects measured on 293T cells expressing different levels of ACE2
    comparisons:
      - Lib1-230614_ACE2_expression
      - Lib2-230614_ACE2_expression
    # Single lasso strength used for the final averaged values; 0.0001 is one of
    # the values listed in `lasso_shifts` of `func_effect_shifts_default`.
    lasso_shift: 0.0001