Skip to content

Commit

Permalink
Add notebook for memory optimized polars
Browse files Browse the repository at this point in the history
  • Loading branch information
olgabot committed Oct 31, 2024
1 parent 8909112 commit 1ad3b21
Show file tree
Hide file tree
Showing 9 changed files with 2,955 additions and 482 deletions.
159 changes: 99 additions & 60 deletions notebooks/05-scan-s3-csv-to-s3-parquet.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 1,
"id": "74768072-fc94-4e7f-82a7-5cfb6ce55bba",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-31T16:39:14.410387Z",
"iopub.status.busy": "2024-10-31T16:39:14.410074Z",
"iopub.status.idle": "2024-10-31T16:39:15.223212Z",
"shell.execute_reply": "2024-10-31T16:39:15.222877Z",
"shell.execute_reply.started": "2024-10-31T16:39:14.410372Z"
}
},
"outputs": [],
"source": [
"import boto3\n",
Expand All @@ -20,88 +28,75 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 2,
"id": "76c826c2-1902-45b8-bad3-566e3ce8385d",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-31T16:39:15.223845Z",
"iopub.status.busy": "2024-10-31T16:39:15.223729Z",
"iopub.status.idle": "2024-10-31T16:39:15.360680Z",
"shell.execute_reply": "2024-10-31T16:39:15.360374Z",
"shell.execute_reply.started": "2024-10-31T16:39:15.223833Z"
}
},
"outputs": [],
"source": [
"s3 = boto3.resource(\"s3\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 3,
"id": "7d934996-2e36-4bb3-9bc4-be6068f09a6d",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-31T16:39:16.083670Z",
"iopub.status.busy": "2024-10-31T16:39:16.083397Z",
"iopub.status.idle": "2024-10-31T16:39:16.085593Z",
"shell.execute_reply": "2024-10-31T16:39:16.085336Z",
"shell.execute_reply.started": "2024-10-31T16:39:16.083657Z"
}
},
"outputs": [],
"source": [
"# !pip install boto3"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "2ed3dc68-0eb5-4504-9097-1c38f72090cd",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-31T16:39:16.452928Z",
"iopub.status.busy": "2024-10-31T16:39:16.452614Z",
"iopub.status.idle": "2024-10-31T16:39:16.454888Z",
"shell.execute_reply": "2024-10-31T16:39:16.454624Z",
"shell.execute_reply.started": "2024-10-31T16:39:16.452915Z"
}
},
"outputs": [],
"source": [
"csv = \"s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.csv\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 5,
"id": "80ddae7b-8598-45ec-99e7-98332ecc26b5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/tmp/lecdzqve: 100%|█████████████████████████████████████████████████████████████████████████████████████| 94.4G/94.4G [24:20<00:00, 64.6MB/s]\n",
"/home/ec2-user/2024-kmerseek-analysis/notebooks/polars_utils.py:40: PerformanceWarning: Determining the column names of a LazyFrame requires resolving its schema, which is a potentially expensive operation. Use `LazyFrame.collect_schema().names()` to get the column names without this warning.\n",
" f\"\\nWriting {df.select(pl.len()).collect().item()} rows and {len(df.columns)} columns to {pq} ...\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Writing 225197519 rows and 13 columns to s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.pq ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"RUN STREAMING PIPELINE\n",
"[csv -> parquet_sink]\n",
"STREAMING CHUNK SIZE: 3846 rows\n",
"/tmp/tmpnxp98qit: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 11.9G/11.9G [00:45<00:00, 262MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\tDone.\n"
]
},
{
"data": {
"text/plain": [
"'s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.pq'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-31T16:39:19.151248Z",
"iopub.status.busy": "2024-10-31T16:39:19.150921Z",
"iopub.status.idle": "2024-10-31T16:39:19.153084Z",
"shell.execute_reply": "2024-10-31T16:39:19.152825Z",
"shell.execute_reply.started": "2024-10-31T16:39:19.151233Z"
}
],
},
"outputs": [],
"source": [
"pq = scan_csv_sink_parquet(csv, verbose=True)\n",
"pq"
"# apq = scan_csv_sink_parquet(csv, verbose=True)\n",
"# pq"
]
},
{
Expand All @@ -114,9 +109,17 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"id": "c92cacc5-4397-4070-80eb-b49da969cde9",
"metadata": {},
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-31T16:39:19.868261Z",
"iopub.status.busy": "2024-10-31T16:39:19.867882Z",
"iopub.status.idle": "2024-10-31T16:39:20.509199Z",
"shell.execute_reply": "2024-10-31T16:39:20.508774Z",
"shell.execute_reply.started": "2024-10-31T16:39:19.868248Z"
}
},
"outputs": [
{
"name": "stdout",
Expand All @@ -130,6 +133,42 @@
"! aws s3 ls --human-readable $csv"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "963651a6-eaa6-467a-a843-fa183c2bcf1b",
"metadata": {
"execution": {
"iopub.execute_input": "2024-10-31T16:39:21.119473Z",
"iopub.status.busy": "2024-10-31T16:39:21.119198Z",
"iopub.status.idle": "2024-10-31T16:39:22.505407Z",
"shell.execute_reply": "2024-10-31T16:39:22.504976Z",
"shell.execute_reply.started": "2024-10-31T16:39:21.119457Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes,prob_overlap,prob_overlap_adjusted,containment_adjusted,containment_adjusted_log10,tf_idf_score\n",
"d1dlwa_ a.1.1.1 (A:) Protozoan/bacterial hemoglobin {Ciliate (Paramecium caudatum) [TaxId: 5885]},26fce7d25101257fca79d052ec7a19c5,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.13333333333333333,0.175,0.08187134502923976,14.0,0.00002055079627314157,4733.697725563677,0.000028166845680341016,-4.5502617856136185,0.3422001829979208\n",
"\"d2gkma_ a.1.1.1 (A:) Protozoan/bacterial hemoglobin {Mycobacterium tuberculosis, HbN [TaxId: 1773]}\",8e68936d4b4c7df11076541fdc12818d,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.18018018018018017,0.25,0.11695906432748537,20.0,0.000026938530809718366,6205.056988017975,0.000029037635033507323,-4.537038757590477,0.47784874417789136\n",
"\"d1ngka_ a.1.1.1 (A:) Protozoan/bacterial hemoglobin {Mycobacterium tuberculosis, HbO [TaxId: 1773]}\",ee99ff865a181da365fd2b7e380d1ef4,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.07142857142857142,0.1,0.043478260869565216,8.0,0.000011271451767976541,2596.281179995116,0.0000275118781351355,-4.560479760904981,0.17213909439405517\n",
"d2bkma_ a.1.1.1 (A:) automated matches {Geobacillus stearothermophilus [TaxId: 1422]},971fec7764919240ea56550590c7c09b,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.15178571428571427,0.2125,0.09714285714285714,17.0,0.000021502632536317173,4952.944955413938,0.00003064554838627899,-4.513632602766626,0.39439446722281774\n",
"d4i0va_ a.1.1.1 (A:) automated matches {Synechococcus sp. [TaxId: 32049]},3365a28682703d9c7f67c734f1bb7479,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.06422018348623854,0.0875,0.038461538461538464,7.0,8.8397814688197e-6,2036.1670115975016,0.0000315397426244784,-4.501141854990149,0.15627294056101132\n",
"\"d1asha_ a.1.1.2 (A:) Ascaris hemoglobin, domain 1 {Pig roundworm (Ascaris suum) [TaxId: 6253]}\",26da02c0e70056ef07230b49a6f0d26a,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.11666666666666667,0.175,0.07526881720430108,14.0,0.000016476975176647633,3795.328360089026,0.000030739544934639085,-4.512302566043756,0.2384115522230015\n",
"d2dc3a_ a.1.1.2 (A:) Cytoglobin {Human (Homo sapiens) [TaxId: 9606]},2191fc99da1820f26c5406df5bcdb63b,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.09523809523809523,0.175,0.06572769953051644,14.0,0.000023014618989600136,5301.217924493132,0.000017965323552926996,-4.745564956806928,0.2032214802329215\n",
"d4hswa_ a.1.1.2 (A:) Dehaloperoxidase {Amphitrite ornata [TaxId: 129555]},0fec36f953b94fde3ee58d82f89ef3b5,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.11382113821138211,0.175,0.07407407407407407,14.0,0.000018564433442604427,4276.156271301549,0.00002661762830682003,-4.5748306437955195,0.2788725157353147\n",
"\"d1ecaa_ a.1.1.2 (A:) Erythrocruorin {Midge (Chironomus thummi thummi), fraction III [TaxId: 7154]}\",fa54f6bd9a5ce6715ecf3f217f488ffe,d3gcea_ b.33.1.0 (A:) automated matches {Nocardioides aromaticivorans [TaxId: 200618]},400e6c363be8bb5a88c54928073b54c6,0.14049586776859505,0.2125,0.09239130434782608,17.0,0.000023956239480141222,5518.112039697998,0.000025460858126447952,-4.594126963082179,0.351130003598395\n",
"download failed: s3://seanome-kmerseek/scope-benchmark/pipeline-outputs/2024-10-09__hp_k20-60/sourmash/multisearch/astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa--in--astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.part_001.fa.hp.10.multisearch.csv to - [Errno 32] Broken pipe\n"
]
}
],
"source": [
"! aws s3 cp $csv - | head"
]
},
{
"cell_type": "code",
"execution_count": 15,
Expand Down Expand Up @@ -432,7 +471,7 @@
"metadata": {},
"outputs": [],
"source": [
"1+1"
"1 + 1"
]
},
{
Expand Down
Loading

0 comments on commit 1ad3b21

Please sign in to comment.