Skip to content

Commit

Permalink
add dms-viz JSON for VRC mAbs
Browse files Browse the repository at this point in the history
  • Loading branch information
jbloom committed Dec 30, 2024
1 parent 5f68a23 commit 47812b0
Show file tree
Hide file tree
Showing 15 changed files with 386 additions and 29,424 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ results/summaries/*

!results/dms-viz
results/dms-viz/*
!results/dms-viz/dms-viz.json
!results/dms-viz/*.json

node_modules/
!homepage/.vitepress/
Expand Down
4 changes: 2 additions & 2 deletions analysis_notebooks/configure_dms_viz.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@
"assert set(tooltip_cols).issubset(phenotypes.columns)\n",
"\n",
"filter_cols = [\"cell_entry\"]\n",
"filter_limits = {\"cell_entry\": [phenotypes[\"cell_entry\"].min(), -3, 0]}"
"filter_limits = {\"cell_entry\": [float(phenotypes[\"cell_entry\"].min()), -3, 0]}"
]
},
{
Expand Down Expand Up @@ -364,7 +364,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,
Expand Down
324 changes: 324 additions & 0 deletions analysis_notebooks/configure_dms_viz_mabs.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6f4b8ac5-ff40-4449-a326-d63651e0d828",
"metadata": {},
"source": [
"# Configure structure based analysis for `dms-viz`"
]
},
{
"cell_type": "markdown",
"id": "c371f68d-0e22-4188-8fca-11c78d6554d3",
"metadata": {},
"source": [
"Imports:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52955e16-8de2-4385-a985-932a8a6a0beb",
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import os\n",
"import requests\n",
"import subprocess\n",
"import textwrap\n",
"import warnings\n",
"\n",
"import Bio.PDB.PDBParser\n",
"import Bio.PDB.Polypeptide\n",
"\n",
"import matplotlib\n",
"\n",
"import pandas as pd\n",
"\n",
"import seaborn"
]
},
{
"cell_type": "markdown",
"id": "27c25a7d-3439-460c-84c0-13599a4fbb80",
"metadata": {},
"source": [
"Define variables. This next cell is tagged `parameters` for `papermill` parameterization:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a26981c-1c55-4ed1-ab17-18ff6ca585b5",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"pdb_id = None\n",
"phenotypes_csv = None\n",
"per_antibody_escape_csv = None\n",
"site_numbering_map = None\n",
"dms_viz_json = None\n",
"dms_viz_sitemap = None\n",
"dms_viz_phenotypes = None\n",
"pdb_file = None\n",
"dms_viz_subdir = None"
]
},
{
"cell_type": "markdown",
"id": "36204609-74f7-488c-98dd-4da325d99f16",
"metadata": {},
"source": [
"Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d5bfb2d-fed2-4213-86b9-e915f4ff9511",
"metadata": {},
"outputs": [],
"source": [
"phenotypes = pd.read_csv(phenotypes_csv)\n",
"\n",
"sitemap = (\n",
" pd.read_csv(site_numbering_map)\n",
" .sort_values(\"sequential_site\")\n",
" .assign(\n",
" HA_chain=lambda x: x[\"HA1_HA2_H5_site\"].str.split().str[1].str[1: -1],\n",
" first_ha2_site=lambda x: x.query(\"HA_chain == 'HA2'\")[\"mature_H5_site\"].min(),\n",
" protein_site=lambda x: x[\"mature_H5_site\"].where(\n",
" x[\"HA_chain\"] == \"HA1\",\n",
" x[\"mature_H5_site\"] - x[\"first_ha2_site\"] + 1,\n",
" ),\n",
" chains=lambda x: x[\"HA_chain\"].map({\"HA1\": \"A\", \"HA2\": \"B\"}),\n",
" )\n",
" .merge(phenotypes[[\"site\", \"wildtype\"]].drop_duplicates().rename(columns={\"site\": \"reference_site\"}))\n",
" [[\"sequential_site\", \"reference_site\", \"protein_site\", \"wildtype\", \"HA_chain\", \"chains\"]]\n",
")\n",
"\n",
"sitemap.to_csv(dms_viz_sitemap, index=False)"
]
},
{
"cell_type": "markdown",
"id": "fe8a3bb1-f557-43b2-9ce9-0418aaa63928",
"metadata": {},
"source": [
"Get the biological assembly (see https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download) as the crystallographic unit doesn't correspond to that:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c2afd40-067f-462e-88d6-d3128dd57ff2",
"metadata": {},
"outputs": [],
"source": [
"r = requests.get(f\"https://files.rcsb.org/download/{pdb_id}.pdb1.gz\")\n",
"assert r.status_code == 200\n",
"pdb_content = gzip.decompress(r.content).decode(\"utf-8\")\n",
"with open(pdb_file, \"w\") as f:\n",
" f.write(pdb_content)"
]
},
{
"cell_type": "markdown",
"id": "b6bbef2e-e52d-4735-9e8e-d5401ebb9f36",
"metadata": {},
"source": [
"Check the sites mismatched between the sitemap and the protein structure in terms of residue identity:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e1ec117-3166-471d-8485-602ab384b50f",
"metadata": {},
"outputs": [],
"source": [
"with warnings.catch_warnings():\n",
" warnings.simplefilter(\"ignore\")\n",
" pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]\n",
"\n",
"records = []\n",
"for chain in [\"A\", \"B\"]:\n",
" for res in pdb_obj[chain].get_residues():\n",
" if not res.id[0].isspace():\n",
" continue\n",
" aa = Bio.PDB.Polypeptide.protein_letters_3to1[res.resname]\n",
" r = res.id[1]\n",
" records.append((chain, r, aa))\n",
"pdb_df = pd.DataFrame(records, columns=[\"chains\", \"protein_site\", \"pdb_aa\"])\n",
"\n",
"mismatched_sites = sitemap.merge(pdb_df, how=\"left\")\n",
"\n",
"print(\n",
" f\"Of {len(sitemap)} sites, {len(mismatched_sites.query('wildtype == pdb_aa'))} match, \"\n",
" f\"{len(mismatched_sites.query('pdb_aa.isnull()'))} are missing from PDB, and \"\n",
" f\"{len(mismatched_sites.query('pdb_aa.notnull()').query('wildtype != pdb_aa'))} differ.\"\n",
")\n",
"\n",
"print(\"Sites that differ:\")\n",
"display(mismatched_sites.query(\"pdb_aa.notnull() and (wildtype != pdb_aa)\").reset_index(drop=True))"
]
},
{
"cell_type": "markdown",
"id": "9f5dd45f-1627-425d-8cb2-63f5f103dcea",
"metadata": {},
"source": [
"Write the phenotypes after adding the antibodyd escape to a CSV file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "562538b8-d250-40a9-ab01-6468bcccafa9",
"metadata": {},
"outputs": [],
"source": [
"per_antibody_escape = (\n",
" pd.read_csv(per_antibody_escape_csv)\n",
" .drop(columns=\"antibody_set\")\n",
" .merge(\n",
" phenotypes[\n",
" [\n",
" \"site\",\n",
" \"mutant\",\n",
" \"entry in 293T cells\",\n",
" \"sequential_site\",\n",
" \"mature_H5_site\",\n",
" \"HA1_HA2_H5_site\",\n",
" ]\n",
" ],\n",
" on=[\"site\", \"mutant\"],\n",
" validate=\"many_to_one\",\n",
" how=\"left\",\n",
" )\n",
" .rename(columns={\"entry in 293T cells\": \"cell_entry\"})\n",
" .assign(\n",
" mutation=lambda x: x[\"wildtype\"] + x[\"site\"].astype(str) + x[\"mutant\"],\n",
" )\n",
")\n",
"\n",
"antibodies = list(per_antibody_escape[\"antibody\"].unique())\n",
"\n",
"print(f\"Read escape for {antibodies=}\")\n",
"\n",
"print(f\"Writing the phenotypes to {dms_viz_phenotypes}\")\n",
"per_antibody_escape.to_csv(dms_viz_phenotypes, index=False, float_format=\"%.4g\")\n",
"\n",
"print(f\"{per_antibody_escape.columns=}\")"
]
},
{
"cell_type": "markdown",
"id": "e7ff0c20-4fc0-497d-b114-1f50ea042fbb",
"metadata": {},
"source": [
"Get enough colors:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce9e2f01-bb4a-48e9-a99c-3d5d73da0d72",
"metadata": {},
"outputs": [],
"source": [
"def get_hex_color_palette(num_colors):\n",
" colors = seaborn.color_palette(\"hls\", num_colors)\n",
" hex_colors = [matplotlib.colors.to_hex(color) for color in colors]\n",
" return hex_colors\n",
"\n",
"nconditions = len(antibodies)\n",
"if nconditions > 4:\n",
" colors = \",\".join(get_hex_color_palette(nconditions))\n",
"else:\n",
" colors = \"#0072B2,#CC79A7,#4C3549,#009E73\"\n",
"\n",
"print(f\"Using {colors=}\")\n",
"seaborn.palplot(seaborn.color_palette(colors.split(\",\")))"
]
},
{
"cell_type": "markdown",
"id": "00e1953c-bd3f-40fc-83a5-84233be74526",
"metadata": {},
"source": [
"Run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/).\n",
"First, set up some options:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22448e4d-a621-49cb-aabc-81baaf821f88",
"metadata": {},
"outputs": [],
"source": [
"tooltip_cols = [\n",
" \"mutation\",\n",
" \"sequential_site\",\n",
" \"mature_H5_site\",\n",
" \"cell_entry\",\n",
"]\n",
"\n",
"cmds = [\n",
" \"configure-dms-viz\", \"format\",\n",
" \"--name\", \"VRC antibodies\",\n",
" \"--input\", dms_viz_phenotypes,\n",
" \"--output\", dms_viz_json,\n",
" \"--structure\", pdb_file,\n",
" \"--metric\", \"escape\",\n",
" \"--condition\", \"antibody\",\n",
" \"--sitemap\", dms_viz_sitemap,\n",
" \"--colors\", colors,\n",
" \"--alphabet\", \"RKHDEQNSTYWFAILMVGPC\",\n",
" \"--summary-stat\", \"sum\",\n",
" \"--floor\", \"True\",\n",
" \"--tooltip-cols\", str({c: c.replace(\"_\", \" \") for c in tooltip_cols}),\n",
" \"--filter-cols\", \"{'cell_entry': 'cell entry'}\",\n",
" \"--filter-limits\", f\"{{'cell_entry': [{float(per_antibody_escape['cell_entry'].min())}, -3, 0]}}\",\n",
" \"--title\", \"H5 HA escape from VRC antibodies as measured by pseudovirus deep mutational scanning\",\n",
" \"--description\", \"H5 HA escape from VRC antibodies as measured by pseudovirus deep mutational scanning\",\n",
"]\n",
"\n",
"print(f\"Running the following commands:\\n{cmds}\")\n",
"subprocess.run(cmds, check=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 47812b0

Please sign in to comment.