-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
386 additions
and
29,424 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,324 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "6f4b8ac5-ff40-4449-a326-d63651e0d828", | ||
"metadata": {}, | ||
"source": [ | ||
"# Configure structure based analysis for `dms-viz`" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c371f68d-0e22-4188-8fca-11c78d6554d3", | ||
"metadata": {}, | ||
"source": [ | ||
"Imports:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "52955e16-8de2-4385-a985-932a8a6a0beb", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import gzip\n", | ||
"import os\n", | ||
"import requests\n", | ||
"import subprocess\n", | ||
"import textwrap\n", | ||
"import warnings\n", | ||
"\n", | ||
"import Bio.PDB.PDBParser\n", | ||
"import Bio.PDB.Polypeptide\n", | ||
"\n", | ||
"import matplotlib\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"import seaborn" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "27c25a7d-3439-460c-84c0-13599a4fbb80", | ||
"metadata": {}, | ||
"source": [ | ||
"Define variables. This next cell is tagged `parameters` for `papermill` parameterization:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0a26981c-1c55-4ed1-ab17-18ff6ca585b5", | ||
"metadata": { | ||
"editable": true, | ||
"slideshow": { | ||
"slide_type": "" | ||
}, | ||
"tags": [ | ||
"parameters" | ||
] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"pdb_id = None\n", | ||
"phenotypes_csv = None\n", | ||
"per_antibody_escape_csv = None\n", | ||
"site_numbering_map = None\n", | ||
"dms_viz_json = None\n", | ||
"dms_viz_sitemap = None\n", | ||
"dms_viz_phenotypes = None\n", | ||
"pdb_file = None\n", | ||
"dms_viz_subdir = None" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "36204609-74f7-488c-98dd-4da325d99f16", | ||
"metadata": {}, | ||
"source": [ | ||
"Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "6d5bfb2d-fed2-4213-86b9-e915f4ff9511", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"phenotypes = pd.read_csv(phenotypes_csv)\n", | ||
"\n", | ||
"sitemap = (\n", | ||
" pd.read_csv(site_numbering_map)\n", | ||
" .sort_values(\"sequential_site\")\n", | ||
" .assign(\n", | ||
" HA_chain=lambda x: x[\"HA1_HA2_H5_site\"].str.split().str[1].str[1: -1],\n", | ||
" first_ha2_site=lambda x: x.query(\"HA_chain == 'HA2'\")[\"mature_H5_site\"].min(),\n", | ||
" protein_site=lambda x: x[\"mature_H5_site\"].where(\n", | ||
" x[\"HA_chain\"] == \"HA1\",\n", | ||
" x[\"mature_H5_site\"] - x[\"first_ha2_site\"] + 1,\n", | ||
" ),\n", | ||
" chains=lambda x: x[\"HA_chain\"].map({\"HA1\": \"A\", \"HA2\": \"B\"}),\n", | ||
" )\n", | ||
" .merge(phenotypes[[\"site\", \"wildtype\"]].drop_duplicates().rename(columns={\"site\": \"reference_site\"}))\n", | ||
" [[\"sequential_site\", \"reference_site\", \"protein_site\", \"wildtype\", \"HA_chain\", \"chains\"]]\n", | ||
")\n", | ||
"\n", | ||
"sitemap.to_csv(dms_viz_sitemap, index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "fe8a3bb1-f557-43b2-9ce9-0418aaa63928", | ||
"metadata": {}, | ||
"source": [ | ||
"Get the biological assembly (see https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download) as the crystallographic unit doesn't correspond to that:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "4c2afd40-067f-462e-88d6-d3128dd57ff2", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"r = requests.get(f\"https://files.rcsb.org/download/{pdb_id}.pdb1.gz\")\n", | ||
"assert r.status_code == 200\n", | ||
"pdb_content = gzip.decompress(r.content).decode(\"utf-8\")\n", | ||
"with open(pdb_file, \"w\") as f:\n", | ||
" f.write(pdb_content)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b6bbef2e-e52d-4735-9e8e-d5401ebb9f36", | ||
"metadata": {}, | ||
"source": [ | ||
"Check the sites mismatched between the sitemap and the protein structure in terms of residue identity:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7e1ec117-3166-471d-8485-602ab384b50f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"with warnings.catch_warnings():\n", | ||
" warnings.simplefilter(\"ignore\")\n", | ||
" pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]\n", | ||
"\n", | ||
"records = []\n", | ||
"for chain in [\"A\", \"B\"]:\n", | ||
" for res in pdb_obj[chain].get_residues():\n", | ||
" if not res.id[0].isspace():\n", | ||
" continue\n", | ||
" aa = Bio.PDB.Polypeptide.protein_letters_3to1[res.resname]\n", | ||
" r = res.id[1]\n", | ||
" records.append((chain, r, aa))\n", | ||
"pdb_df = pd.DataFrame(records, columns=[\"chains\", \"protein_site\", \"pdb_aa\"])\n", | ||
"\n", | ||
"mismatched_sites = sitemap.merge(pdb_df, how=\"left\")\n", | ||
"\n", | ||
"print(\n", | ||
" f\"Of {len(sitemap)} sites, {len(mismatched_sites.query('wildtype == pdb_aa'))} match, \"\n", | ||
" f\"{len(mismatched_sites.query('pdb_aa.isnull()'))} are missing from PDB, and \"\n", | ||
" f\"{len(mismatched_sites.query('pdb_aa.notnull()').query('wildtype != pdb_aa'))} differ.\"\n", | ||
")\n", | ||
"\n", | ||
"print(\"Sites that differ:\")\n", | ||
"display(mismatched_sites.query(\"pdb_aa.notnull() and (wildtype != pdb_aa)\").reset_index(drop=True))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "9f5dd45f-1627-425d-8cb2-63f5f103dcea", | ||
"metadata": {}, | ||
"source": [ | ||
"Write the phenotypes after adding the antibodyd escape to a CSV file:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "562538b8-d250-40a9-ab01-6468bcccafa9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"per_antibody_escape = (\n", | ||
" pd.read_csv(per_antibody_escape_csv)\n", | ||
" .drop(columns=\"antibody_set\")\n", | ||
" .merge(\n", | ||
" phenotypes[\n", | ||
" [\n", | ||
" \"site\",\n", | ||
" \"mutant\",\n", | ||
" \"entry in 293T cells\",\n", | ||
" \"sequential_site\",\n", | ||
" \"mature_H5_site\",\n", | ||
" \"HA1_HA2_H5_site\",\n", | ||
" ]\n", | ||
" ],\n", | ||
" on=[\"site\", \"mutant\"],\n", | ||
" validate=\"many_to_one\",\n", | ||
" how=\"left\",\n", | ||
" )\n", | ||
" .rename(columns={\"entry in 293T cells\": \"cell_entry\"})\n", | ||
" .assign(\n", | ||
" mutation=lambda x: x[\"wildtype\"] + x[\"site\"].astype(str) + x[\"mutant\"],\n", | ||
" )\n", | ||
")\n", | ||
"\n", | ||
"antibodies = list(per_antibody_escape[\"antibody\"].unique())\n", | ||
"\n", | ||
"print(f\"Read escape for {antibodies=}\")\n", | ||
"\n", | ||
"print(f\"Writing the phenotypes to {dms_viz_phenotypes}\")\n", | ||
"per_antibody_escape.to_csv(dms_viz_phenotypes, index=False, float_format=\"%.4g\")\n", | ||
"\n", | ||
"print(f\"{per_antibody_escape.columns=}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "e7ff0c20-4fc0-497d-b114-1f50ea042fbb", | ||
"metadata": {}, | ||
"source": [ | ||
"Get enough colors:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ce9e2f01-bb4a-48e9-a99c-3d5d73da0d72", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_hex_color_palette(num_colors):\n", | ||
" colors = seaborn.color_palette(\"hls\", num_colors)\n", | ||
" hex_colors = [matplotlib.colors.to_hex(color) for color in colors]\n", | ||
" return hex_colors\n", | ||
"\n", | ||
"nconditions = len(antibodies)\n", | ||
"if nconditions > 4:\n", | ||
" colors = \",\".join(get_hex_color_palette(nconditions))\n", | ||
"else:\n", | ||
" colors = \"#0072B2,#CC79A7,#4C3549,#009E73\"\n", | ||
"\n", | ||
"print(f\"Using {colors=}\")\n", | ||
"seaborn.palplot(seaborn.color_palette(colors.split(\",\")))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "00e1953c-bd3f-40fc-83a5-84233be74526", | ||
"metadata": {}, | ||
"source": [ | ||
"Run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/).\n", | ||
"First, set up some options:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "22448e4d-a621-49cb-aabc-81baaf821f88", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"tooltip_cols = [\n", | ||
" \"mutation\",\n", | ||
" \"sequential_site\",\n", | ||
" \"mature_H5_site\",\n", | ||
" \"cell_entry\",\n", | ||
"]\n", | ||
"\n", | ||
"cmds = [\n", | ||
" \"configure-dms-viz\", \"format\",\n", | ||
" \"--name\", \"VRC antibodies\",\n", | ||
" \"--input\", dms_viz_phenotypes,\n", | ||
" \"--output\", dms_viz_json,\n", | ||
" \"--structure\", pdb_file,\n", | ||
" \"--metric\", \"escape\",\n", | ||
" \"--condition\", \"antibody\",\n", | ||
" \"--sitemap\", dms_viz_sitemap,\n", | ||
" \"--colors\", colors,\n", | ||
" \"--alphabet\", \"RKHDEQNSTYWFAILMVGPC\",\n", | ||
" \"--summary-stat\", \"sum\",\n", | ||
" \"--floor\", \"True\",\n", | ||
" \"--tooltip-cols\", str({c: c.replace(\"_\", \" \") for c in tooltip_cols}),\n", | ||
" \"--filter-cols\", \"{'cell_entry': 'cell entry'}\",\n", | ||
" \"--filter-limits\", f\"{{'cell_entry': [{float(per_antibody_escape['cell_entry'].min())}, -3, 0]}}\",\n", | ||
" \"--title\", \"H5 HA escape from VRC antibodies as measured by pseudovirus deep mutational scanning\",\n", | ||
" \"--description\", \"H5 HA escape from VRC antibodies as measured by pseudovirus deep mutational scanning\",\n", | ||
"]\n", | ||
"\n", | ||
"print(f\"Running the following commands:\\n{cmds}\")\n", | ||
"subprocess.run(cmds, check=True)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.