Skip to content

Commit

Permalink
Add BCL2/Ced9, CD47/P66 examples
Browse files Browse the repository at this point in the history
  • Loading branch information
olgabot committed Dec 6, 2024
1 parent 0bd6db8 commit 104e6fa
Show file tree
Hide file tree
Showing 24 changed files with 5,644 additions and 2,000 deletions.
265 changes: 142 additions & 123 deletions notebooks/05-scan-s3-csv-to-s3-parquet.ipynb

Large diffs are not rendered by default.

324 changes: 212 additions & 112 deletions notebooks/13-compute-all-sensitivity-to-first-fp.ipynb

Large diffs are not rendered by default.

437 changes: 321 additions & 116 deletions notebooks/14-plot-sensitivity-to-first-fp.ipynb

Large diffs are not rendered by default.

1,673 changes: 462 additions & 1,211 deletions notebooks/15-which-kmers-match.ipynb

Large diffs are not rendered by default.

3,940 changes: 3,940 additions & 0 deletions notebooks/16-ced9-bcl2-and-p66-cd47.ipynb

Large diffs are not rendered by default.

206 changes: 206 additions & 0 deletions notebooks/17-explore-hp-k10.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "103ccd5c-4285-4267-99c3-2110066a0661",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "0f4cd62d-9a46-405c-b38b-12df6d8158fc",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c32851e0-276d-483a-8d6c-15a42d5dab0e",
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-07T21:58:24.482905Z",
"iopub.status.busy": "2024-11-07T21:58:24.482454Z",
"iopub.status.idle": "2024-11-07T21:58:24.497152Z",
"shell.execute_reply": "2024-11-07T21:58:24.496791Z",
"shell.execute_reply.started": "2024-11-07T21:58:24.482890Z"
}
},
"outputs": [],
"source": [
"\n",
"import os\n",
"import polars as pl\n",
"import seaborn as sns\n",
"\n",
"# Handwritten local modules\n",
"from sig2kmer import degenerate_protein_chatgpt"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a782a96b-cac4-4520-8e60-cd30de5272d3",
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-07T21:58:24.846718Z",
"iopub.status.busy": "2024-11-07T21:58:24.846470Z",
"iopub.status.idle": "2024-11-07T21:59:09.700968Z",
"shell.execute_reply": "2024-11-07T21:59:09.700668Z",
"shell.execute_reply.started": "2024-11-07T21:58:24.846705Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<h4>NAIVE QUERY PLAN</h4><p>run <b>LazyFrame.show_graph()</b> to see the optimized version</p><?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 2.44.0 (0)\n",
" -->\n",
"<!-- Title: polars_query Pages: 1 -->\n",
"<svg width=\"939pt\" height=\"46pt\"\n",
" viewBox=\"0.00 0.00 939.00 46.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 42)\">\n",
"<title>polars_query</title>\n",
"<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-42 935,-42 935,4 -4,4\"/>\n",
"<!-- p1 -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>p1</title>\n",
"<polygon fill=\"none\" stroke=\"black\" points=\"931,-38 0,-38 0,0 931,0 931,-38\"/>\n",
"<text text-anchor=\"middle\" x=\"465.5\" y=\"-22.8\" font-family=\"Times-Roman\" font-size=\"14.00\">Parquet SCAN [s3://seanome&#45;kmerseek/scope&#45;benchmark/analysis&#45;outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k10.filtered.pq]</text>\n",
"<text text-anchor=\"middle\" x=\"465.5\" y=\"-7.8\" font-family=\"Times-Roman\" font-size=\"14.00\">π */38;</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<LazyFrame at 0x7FBEBCB57AA0>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remote (S3) location of the filtered hp / k10 multisearch results\n",
"pq = \"s3://seanome-kmerseek/scope-benchmark/analysis-outputs/hp/00_cleaned_multisearch_results/scope40.multisearch.hp.k10.filtered.pq\"\n",
"\n",
"# scan_parquet returns a LazyFrame; displaying it renders the query plan, not rows\n",
"multisearch = pl.scan_parquet(source=pq)\n",
"multisearch"
]
},
{
"cell_type": "markdown",
"id": "053f1508-3e5b-4cf9-84cc-2d43c131d329",
"metadata": {},
"source": [
"# Read in data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "74c36a49-fcf4-4b37-8167-2da125057d4d",
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-07T21:59:09.733249Z",
"iopub.status.busy": "2024-11-07T21:59:09.732985Z",
"iopub.status.idle": "2024-11-07T21:59:09.735189Z",
"shell.execute_reply": "2024-11-07T21:59:09.734951Z",
"shell.execute_reply.started": "2024-11-07T21:59:09.733238Z"
}
},
"outputs": [],
"source": [
"# Parameters that select which multisearch parquet file is loaded below\n",
"moltype = \"hp\"  # alphabet label used in the result filename\n",
"ksize = 10  # k-mer size used in the result filename"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7b46f44-ac7e-4449-aa6f-609e9b5787e0",
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-07T21:59:09.735689Z",
"iopub.status.busy": "2024-11-07T21:59:09.735466Z"
}
},
"outputs": [],
"source": [
"# Local folder holding the cleaned multisearch results. The folder is derived\n",
"# from `moltype` so it always agrees with the moltype used in the filename below\n",
"# (previously the folder hard-coded \"hp\" while the filename used `moltype`).\n",
"cleaned_multisearch_folder = f\"/home/ec2-user/data/seanome-kmerseek/scope-benchmark/analysis-outputs/{moltype}/00_cleaned_multisearch_results\"\n",
"\n",
"pq = os.path.join(\n",
"    cleaned_multisearch_folder, f\"scope40.multisearch.{moltype}.k{ksize}.filtered.pq\"\n",
")\n",
"\n",
"# Lazily scan the parquet file: this yields a LazyFrame (a query plan), not rows\n",
"multisearch = pl.scan_parquet(pq)\n",
"multisearch"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36fe4ff2-81c3-4581-85b0-4e431eff2641",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the hits for a single query SCOP id. `multisearch` is a LazyFrame, so\n",
"# `.collect()` is required to display the matching rows rather than the query plan.\n",
"multisearch.filter(pl.col(\"query_scop_id\") == \"d1ty4a_\").collect()"
]
},
{
"cell_type": "markdown",
"id": "90495133-879b-4fab-b17c-3f3a16fd1a50",
"metadata": {},
"source": [
"### Filter on query family [\"Family f.1.4.1: Bcl-2 inhibitors of programmed cell death\"](https://scop.berkeley.edu/sunid=56855)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddf848ee-a3f5-4510-b786-1fc935905854",
"metadata": {},
"outputs": [],
"source": [
"# SCOP family identifier for the Bcl-2 family named in the heading above\n",
"bcl2_family = \"f.1.4.1\"\n",
"\n",
"# Keep only rows whose query belongs to that family, then materialize the result\n",
"multisearch_bcl2 = (\n",
"    multisearch\n",
"    .filter(pl.col(\"query_family\") == bcl2_family)\n",
"    .collect()\n",
")\n",
"multisearch_bcl2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "455a4aa9-aaef-428c-81df-8977563d6389",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:2024-kmerseek-analysis-polars]",
"language": "python",
"name": "conda-env-2024-kmerseek-analysis-polars-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 104e6fa

Please sign in to comment.