add dms-viz JSON for VRC mAbs

dms-vep · Dec 30, 2024 · 47812b0 · 47812b0
1 parent 5f68a23
commit 47812b0
Show file tree

Hide file tree

Showing 15 changed files with 386 additions and 29,424 deletions.
diff --git a/.gitignore b/.gitignore
@@ -86,7 +86,7 @@ results/summaries/*
 
 !results/dms-viz
 results/dms-viz/*
-!results/dms-viz/dms-viz.json
+!results/dms-viz/*.json
 
 node_modules/
 !homepage/.vitepress/

diff --git a/analysis_notebooks/configure_dms_viz.ipynb b/analysis_notebooks/configure_dms_viz.ipynb
@@ -266,7 +266,7 @@
     "assert set(tooltip_cols).issubset(phenotypes.columns)\n",
     "\n",
     "filter_cols = [\"cell_entry\"]\n",
-    "filter_limits = {\"cell_entry\": [phenotypes[\"cell_entry\"].min(), -3, 0]}"
+    "filter_limits = {\"cell_entry\": [float(phenotypes[\"cell_entry\"].min()), -3, 0]}"
    ]
   },
   {
@@ -364,7 +364,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,

diff --git a/analysis_notebooks/configure_dms_viz_mabs.ipynb b/analysis_notebooks/configure_dms_viz_mabs.ipynb
@@ -0,0 +1,324 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6f4b8ac5-ff40-4449-a326-d63651e0d828",
+   "metadata": {},
+   "source": [
+    "# Configure structure-based analysis for `dms-viz`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c371f68d-0e22-4188-8fca-11c78d6554d3",
+   "metadata": {},
+   "source": [
+    "Imports:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52955e16-8de2-4385-a985-932a8a6a0beb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gzip\n",
+    "import os\n",
+    "import requests\n",
+    "import subprocess\n",
+    "import textwrap\n",
+    "import warnings\n",
+    "\n",
+    "import Bio.PDB.PDBParser\n",
+    "import Bio.PDB.Polypeptide\n",
+    "\n",
+    "import matplotlib\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "import seaborn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "27c25a7d-3439-460c-84c0-13599a4fbb80",
+   "metadata": {},
+   "source": [
+    "Define variables. This next cell is tagged `parameters` for `papermill` parameterization:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a26981c-1c55-4ed1-ab17-18ff6ca585b5",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "pdb_id = None  # PDB ID whose `.pdb1.gz` biological assembly is downloaded from RCSB\n",
+    "phenotypes_csv = None  # input CSV of phenotypes, read with pandas\n",
+    "per_antibody_escape_csv = None  # input CSV of per-antibody escape, read with pandas\n",
+    "site_numbering_map = None  # input CSV relating the site numbering schemes\n",
+    "dms_viz_json = None  # output path passed to `configure-dms-viz format --output`\n",
+    "dms_viz_sitemap = None  # output sitemap CSV written here and passed as `--sitemap`\n",
+    "dms_viz_phenotypes = None  # output phenotypes CSV written here and passed as `--input`\n",
+    "pdb_file = None  # path the downloaded structure is written to and passed as `--structure`\n",
+    "dms_viz_subdir = None  # NOTE(review): not referenced elsewhere in this notebook -- confirm it is needed"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "36204609-74f7-488c-98dd-4da325d99f16",
+   "metadata": {},
+   "source": [
+    "Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d5bfb2d-fed2-4213-86b9-e915f4ff9511",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Phenotypes supply the wildtype residue for each reference site.\n",
+    "phenotypes = pd.read_csv(phenotypes_csv)\n",
+    "\n",
+    "# Numbering map ordered by sequential site. The HA chain label is the second\n",
+    "# whitespace-separated token of `HA1_HA2_H5_site` minus its first and last character.\n",
+    "numbering = pd.read_csv(site_numbering_map).sort_values(\"sequential_site\")\n",
+    "numbering = numbering.assign(\n",
+    "    HA_chain=numbering[\"HA1_HA2_H5_site\"].str.split().str[1].str[1: -1]\n",
+    ")\n",
+    "\n",
+    "# HA1 rows keep their mature site number; all other rows are renumbered so\n",
+    "# that the lowest HA2 mature site becomes 1.\n",
+    "first_ha2_site = numbering.query(\"HA_chain == 'HA2'\")[\"mature_H5_site\"].min()\n",
+    "is_ha1 = numbering[\"HA_chain\"] == \"HA1\"\n",
+    "numbering = numbering.assign(\n",
+    "    protein_site=numbering[\"mature_H5_site\"].where(\n",
+    "        is_ha1,\n",
+    "        numbering[\"mature_H5_site\"] - first_ha2_site + 1,\n",
+    "    ),\n",
+    "    chains=numbering[\"HA_chain\"].map({\"HA1\": \"A\", \"HA2\": \"B\"}),\n",
+    ")\n",
+    "\n",
+    "# One wildtype residue per site, keyed by the `reference_site` name used in the sitemap.\n",
+    "wildtypes = (\n",
+    "    phenotypes[[\"site\", \"wildtype\"]]\n",
+    "    .drop_duplicates()\n",
+    "    .rename(columns={\"site\": \"reference_site\"})\n",
+    ")\n",
+    "\n",
+    "sitemap_cols = [\"sequential_site\", \"reference_site\", \"protein_site\", \"wildtype\", \"HA_chain\", \"chains\"]\n",
+    "sitemap = numbering.merge(wildtypes)[sitemap_cols]\n",
+    "\n",
+    "sitemap.to_csv(dms_viz_sitemap, index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe8a3bb1-f557-43b2-9ce9-0418aaa63928",
+   "metadata": {},
+   "source": [
+    "Get the biological assembly (see https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download) as the crystallographic unit doesn't correspond to that:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c2afd40-067f-462e-88d6-d3128dd57ff2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch the gzipped biological assembly file (`.pdb1.gz`) for `pdb_id` from RCSB.\n",
+    "# The timeout keeps the notebook from hanging forever on a stalled connection.\n",
+    "r = requests.get(f\"https://files.rcsb.org/download/{pdb_id}.pdb1.gz\", timeout=60)\n",
+    "# Fail with a descriptive error (status and URL) rather than a bare `assert`,\n",
+    "# which carries no context and is skipped entirely under `python -O`.\n",
+    "r.raise_for_status()\n",
+    "if r.status_code != 200:\n",
+    "    raise RuntimeError(f\"Unexpected HTTP status {r.status_code} fetching {r.url}\")\n",
+    "# Decompress in memory and write the plain-text PDB to `pdb_file`.\n",
+    "pdb_content = gzip.decompress(r.content).decode(\"utf-8\")\n",
+    "with open(pdb_file, \"w\") as f:\n",
+    "    f.write(pdb_content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6bbef2e-e52d-4735-9e8e-d5401ebb9f36",
+   "metadata": {},
+   "source": [
+    "Check for sites where the residue identity differs between the sitemap and the protein structure:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e1ec117-3166-471d-8485-602ab384b50f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse the structure and keep its first model; parser warnings are silenced.\n",
+    "with warnings.catch_warnings():\n",
+    "    warnings.simplefilter(\"ignore\")\n",
+    "    pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]\n",
+    "\n",
+    "# Collect (chain, residue number, one-letter amino acid) for chains A and B,\n",
+    "# skipping residues whose hetero flag (`res.id[0]`) is not blank.\n",
+    "records = []\n",
+    "for chain in [\"A\", \"B\"]:\n",
+    "    for res in pdb_obj[chain].get_residues():\n",
+    "        if not res.id[0].isspace():\n",
+    "            continue\n",
+    "        aa = Bio.PDB.Polypeptide.protein_letters_3to1[res.resname]\n",
+    "        # Named `resnum` rather than `r` so the HTTP response `r` from the\n",
+    "        # download cell is not silently overwritten.\n",
+    "        resnum = res.id[1]\n",
+    "        records.append((chain, resnum, aa))\n",
+    "pdb_df = pd.DataFrame(records, columns=[\"chains\", \"protein_site\", \"pdb_aa\"])\n",
+    "\n",
+    "# Left merge keeps every sitemap row; the join keys are spelled out instead\n",
+    "# of relying on whichever column names the two frames happen to share.\n",
+    "mismatched_sites = sitemap.merge(pdb_df, how=\"left\", on=[\"chains\", \"protein_site\"])\n",
+    "\n",
+    "print(\n",
+    "    f\"Of {len(sitemap)} sites, {len(mismatched_sites.query('wildtype == pdb_aa'))} match, \"\n",
+    "    f\"{len(mismatched_sites.query('pdb_aa.isnull()'))} are missing from PDB, and \"\n",
+    "    f\"{len(mismatched_sites.query('pdb_aa.notnull()').query('wildtype != pdb_aa'))} differ.\"\n",
+    ")\n",
+    "\n",
+    "print(\"Sites that differ:\")\n",
+    "display(mismatched_sites.query(\"pdb_aa.notnull() and (wildtype != pdb_aa)\").reset_index(drop=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f5dd45f-1627-425d-8cb2-63f5f103dcea",
+   "metadata": {},
+   "source": [
+    "Write the phenotypes after adding the antibody escape to a CSV file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "562538b8-d250-40a9-ab01-6468bcccafa9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Phenotype columns attached to each escape measurement.\n",
+    "phenotype_cols = [\n",
+    "    \"site\",\n",
+    "    \"mutant\",\n",
+    "    \"entry in 293T cells\",\n",
+    "    \"sequential_site\",\n",
+    "    \"mature_H5_site\",\n",
+    "    \"HA1_HA2_H5_site\",\n",
+    "]\n",
+    "\n",
+    "escape_raw = pd.read_csv(per_antibody_escape_csv).drop(columns=\"antibody_set\")\n",
+    "\n",
+    "# Left merge keeps every escape row; `validate` requires each (site, mutant)\n",
+    "# pair to occur at most once in the phenotypes.\n",
+    "per_antibody_escape = (\n",
+    "    escape_raw\n",
+    "    .merge(\n",
+    "        phenotypes[phenotype_cols],\n",
+    "        on=[\"site\", \"mutant\"],\n",
+    "        validate=\"many_to_one\",\n",
+    "        how=\"left\",\n",
+    "    )\n",
+    "    .rename(columns={\"entry in 293T cells\": \"cell_entry\"})\n",
+    "    .assign(\n",
+    "        mutation=lambda df: df[\"wildtype\"] + df[\"site\"].astype(str) + df[\"mutant\"],\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# Antibodies in order of first appearance.\n",
+    "antibodies = list(per_antibody_escape[\"antibody\"].unique())\n",
+    "print(f\"Read escape for {antibodies=}\")\n",
+    "\n",
+    "print(f\"Writing the phenotypes to {dms_viz_phenotypes}\")\n",
+    "per_antibody_escape.to_csv(dms_viz_phenotypes, index=False, float_format=\"%.4g\")\n",
+    "\n",
+    "print(f\"{per_antibody_escape.columns=}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7ff0c20-4fc0-497d-b114-1f50ea042fbb",
+   "metadata": {},
+   "source": [
+    "Get enough colors:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce9e2f01-bb4a-48e9-a99c-3d5d73da0d72",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_hex_color_palette(num_colors):\n",
+    "    \"\"\"Return `num_colors` hex color strings from seaborn's `hls` palette.\"\"\"\n",
+    "    return [\n",
+    "        matplotlib.colors.to_hex(rgb)\n",
+    "        for rgb in seaborn.color_palette(\"hls\", num_colors)\n",
+    "    ]\n",
+    "\n",
+    "# One color per antibody: a fixed four-color string when there are at most\n",
+    "# four antibodies, otherwise a generated palette of the needed size.\n",
+    "nconditions = len(antibodies)\n",
+    "colors = (\n",
+    "    \",\".join(get_hex_color_palette(nconditions))\n",
+    "    if nconditions > 4\n",
+    "    else \"#0072B2,#CC79A7,#4C3549,#009E73\"\n",
+    ")\n",
+    "\n",
+    "print(f\"Using {colors=}\")\n",
+    "seaborn.palplot(seaborn.color_palette(colors.split(\",\")))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "00e1953c-bd3f-40fc-83a5-84233be74526",
+   "metadata": {},
+   "source": [
+    "Run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/).\n",
+    "First, set up some options:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22448e4d-a621-49cb-aabc-81baaf821f88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tooltip_cols = [\n",
+    "    \"mutation\",\n",
+    "    \"sequential_site\",\n",
+    "    \"mature_H5_site\",\n",
+    "    \"cell_entry\",\n",
+    "]\n",
+    "\n",
+    "# Tooltip labels are the column names with underscores replaced by spaces.\n",
+    "tooltip_labels = {col: col.replace(\"_\", \" \") for col in tooltip_cols}\n",
+    "\n",
+    "# First element of the `--filter-limits` list: smallest cell entry in the data.\n",
+    "min_cell_entry = float(per_antibody_escape[\"cell_entry\"].min())\n",
+    "\n",
+    "# The same sentence is used for both the title and the description.\n",
+    "summary_text = \"H5 HA escape from VRC antibodies as measured by pseudovirus deep mutational scanning\"\n",
+    "\n",
+    "# Flags in the order they are passed on the command line.\n",
+    "options = {\n",
+    "    \"--name\": \"VRC antibodies\",\n",
+    "    \"--input\": dms_viz_phenotypes,\n",
+    "    \"--output\": dms_viz_json,\n",
+    "    \"--structure\": pdb_file,\n",
+    "    \"--metric\": \"escape\",\n",
+    "    \"--condition\": \"antibody\",\n",
+    "    \"--sitemap\": dms_viz_sitemap,\n",
+    "    \"--colors\": colors,\n",
+    "    \"--alphabet\": \"RKHDEQNSTYWFAILMVGPC\",\n",
+    "    \"--summary-stat\": \"sum\",\n",
+    "    \"--floor\": \"True\",\n",
+    "    \"--tooltip-cols\": str(tooltip_labels),\n",
+    "    \"--filter-cols\": \"{'cell_entry': 'cell entry'}\",\n",
+    "    \"--filter-limits\": f\"{{'cell_entry': [{min_cell_entry}, -3, 0]}}\",\n",
+    "    \"--title\": summary_text,\n",
+    "    \"--description\": summary_text,\n",
+    "}\n",
+    "\n",
+    "cmds = [\"configure-dms-viz\", \"format\"]\n",
+    "for flag, value in options.items():\n",
+    "    cmds += [flag, value]\n",
+    "\n",
+    "print(f\"Running the following commands:\\n{cmds}\")\n",
+    "subprocess.run(cmds, check=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}