diff --git a/notebooks/16-ced9-bcl2-and-p66-cd47-dayhoff (1).ipynb b/notebooks/16-ced9-bcl2-and-p66-cd47-dayhoff (1).ipynb new file mode 100644 index 0000000..1c04381 --- /dev/null +++ b/notebooks/16-ced9-bcl2-and-p66-cd47-dayhoff (1).ipynb @@ -0,0 +1,3982 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ec46447c-5e3e-4d2c-9e28-5d1f7860deaf", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "02d8e1bf-6ca0-40a0-9ef3-af317592cc82", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:31.268053Z", + "iopub.status.busy": "2024-11-12T21:31:31.267726Z", + "iopub.status.idle": "2024-11-12T21:31:32.639537Z", + "shell.execute_reply": "2024-11-12T21:31:32.639214Z", + "shell.execute_reply.started": "2024-11-12T21:31:31.268039Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "# Handwritten local modules\n", + "from sig2kmer import degenerate_protein_chatgpt\n", + "\n", + "import sourmash" + ] + }, + { + "cell_type": "markdown", + "id": "fc6e9b32-0acd-4727-9ba3-261edafe783e", + "metadata": {}, + "source": [ + "## Try aligning dayhoff versions of CED9 and BCL2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e63be64c-3bd8-4bea-bf3c-2284addf8046", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:32.640388Z", + "iopub.status.busy": "2024-11-12T21:31:32.640079Z", + "iopub.status.idle": "2024-11-12T21:31:32.642225Z", + "shell.execute_reply": "2024-11-12T21:31:32.641997Z", + "shell.execute_reply.started": "2024-11-12T21:31:32.640376Z" + } + }, + "outputs": [], + "source": [ + "ced9_seq = \"MTRCTADNSLTNPAYRRRTMATGEMKEFLGIKGTEPTDFGINSDAQDLPSPSRQASTRRMSIGESIDGKINDWEEPRLDIEGFVVDYFTHRIRQNGMEWFGAPGLPCGVQPEHEMMRVMGTIFEKKHAENFETFCEQLLAVPRISFSLYQDVVRTVGNAQTDQCPMSYGRLIGLISFGGFVAAKMMESVELQGQVRNLFVYTSLFIKTRIRNNWKEHNRSWDDFMTLGKQMKEDYERAEAEKVGRRKQNRRWSMIGAGVTAGAIGIVGVVVCGRMMFSLK\"\n", + "bcl2_seq 
= \"MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPPVVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1408cb59-5dfb-4d13-843e-8b1deda6e0a0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:32.642586Z", + "iopub.status.busy": "2024-11-12T21:31:32.642487Z", + "iopub.status.idle": "2024-11-12T21:31:32.649951Z", + "shell.execute_reply": "2024-11-12T21:31:32.649731Z", + "shell.execute_reply.started": "2024-11-12T21:31:32.642578Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'ebdabbccbebcbbfdddbebbbcedcfebedbbcbbcfbecbcbccebbbbdcbbbddebebcbecbdeccfccbdececbfeecffbddedccbecffbbbbebabecbcdceedeebbefcdddbccfcbfacceebebdebfbefcceedbebcbcbccabebfbdeebeebfbbfebbdeecbececbcedcefefbbefedbdedccfdcdcdbfccfebebdcedccfcdbcbcdebdddccddfbeebbbebbbbebeebeeeabdeefbed'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ced9_seq_dayhoff = degenerate_protein_chatgpt(ced9_seq, \"dayhoff\")\n", + "ced9_seq_dayhoff" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a6bc3196-4903-4b1f-9c75-aa543227691f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:32.650626Z", + "iopub.status.busy": "2024-11-12T21:31:32.650521Z", + "iopub.status.idle": "2024-11-12T21:31:32.654521Z", + "shell.execute_reply": "2024-11-12T21:31:32.654306Z", + "shell.execute_reply.started": "2024-11-12T21:31:32.650617Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'ebdbbdbbfccdceeedfedfdebcdbfcfcbbcebbbbbbbbbbbbefbbcbbdbbdbbbbdcbebdbbbecbbbbbbbbbbbbebbebbeedebedcbbccfbddfddcfbcebbcedebbfbbdbdfbbeeccefdcbecfbdeebffcfbbeeaecbecdcebbeeccebefebcfecddedbfecccbbfcbfecefbbbedbefcfbfebedbeebebeebbaebebbfebdd'" + ] + }, + "execution_count": 4, + "metadata": {}, 
def kmerize(sequence, ksize):
    """Return every overlapping k-mer of ``sequence``, in positional order.

    Parameters
    ----------
    sequence : str
        Sequence to window over. Anything that supports ``len`` and slicing
        works; the notebook passes Dayhoff-encoded protein strings.
    ksize : int
        Length of each k-mer. Must be at least 1.

    Returns
    -------
    list
        The ``len(sequence) - ksize + 1`` windows of length ``ksize``, repeats
        included. Empty when ``sequence`` is shorter than ``ksize``.

    Raises
    ------
    ValueError
        If ``ksize`` is smaller than 1. Without this check ``ksize=0`` would
        yield ``len(sequence) + 1`` empty strings, which downstream code would
        count as a shared "k-mer".
    """
    if ksize < 1:
        raise ValueError(f"ksize must be a positive integer, got {ksize!r}")
    # A negative range length (sequence shorter than ksize) simply yields [].
    return [sequence[i : i + ksize] for i in range(len(sequence) - ksize + 1)]


def calculate_jaccard(set1, set2):
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two collections.

    Parameters
    ----------
    set1, set2 : iterable of hashable
        Collections to compare. Sets are the usual input; other iterables
        (e.g. k-mer lists) are de-duplicated first.

    Returns
    -------
    float
        Value in ``[0, 1]``. When both inputs are empty the ratio is undefined
        and ``0.0`` is returned instead of raising ``ZeroDivisionError``; this
        happens whenever ``ksize`` exceeds the length of both sequences.
    """
    set1, set2 = set(set1), set(set2)
    union = set1 | set2
    if not union:
        # Nothing to compare: report "no similarity" rather than dividing by 0.
        return 0.0
    return len(set1 & set2) / len(union)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querymatchmoltypeksizejaccardquery_n_kmersquery_n_unique_kmersmatch_n_kmersmatch_n_unique_kmers
0BCL2ced9dayhoff20.8181822382827932
1BCL2ced9dayhoff30.606061237100278112
2BCL2ced9dayhoff40.198113236172277209
3BCL2ced9dayhoff50.055046235204276256
4BCL2ced9dayhoff60.016667234215275273
5BCL2ced9dayhoff70.002033233219274274
6BCL2ced9dayhoff80.000000232222273273
7BCL2ced9dayhoff90.000000231223272272
8BCL2ced9dayhoff100.000000230224271271
9BCL2ced9dayhoff110.000000229225270270
10BCL2ced9dayhoff120.000000228226269269
11BCL2ced9dayhoff130.000000227226268268
12BCL2ced9dayhoff140.000000226226267267
# Column layout of the summary table; every row built below follows this order.
jaccard_columns = [
    "query",
    "match",
    "moltype",
    "ksize",
    "jaccard",
    "query_n_kmers",
    "query_n_unique_kmers",
    "match_n_kmers",
    "match_n_unique_kmers",
]

# k-mer sizes to compare: k = 2 through 14 inclusive.
ksizes = range(2, 15)

lines = []
for ksize in ksizes:
    # All overlapping k-mers (repeats kept) of each Dayhoff-encoded sequence.
    bcl2_dayhoff_kmers, ced9_dayhoff_kmers = (
        kmerize(encoded, ksize) for encoded in (bcl2_seq_dayhoff, ced9_seq_dayhoff)
    )

    # The similarity is computed on distinct k-mers only.
    bcl2_dayhoff_kmers_set = set(bcl2_dayhoff_kmers)
    ced9_dayhoff_kmers_set = set(ced9_dayhoff_kmers)
    jaccard = calculate_jaccard(bcl2_dayhoff_kmers_set, ced9_dayhoff_kmers_set)

    # One record per ksize: labels, similarity, then total/unique k-mer counts
    # for the query (BCL2) followed by the match (ced9).
    row = [
        "BCL2",
        "ced9",
        "dayhoff",
        ksize,
        jaccard,
        len(bcl2_dayhoff_kmers),
        len(bcl2_dayhoff_kmers_set),
        len(ced9_dayhoff_kmers),
        len(ced9_dayhoff_kmers_set),
    ]
    lines.append(row)

jaccard_df = pd.DataFrame(lines, columns=jaccard_columns)
jaccard_df
u1RrjND5X2SJW0Dvht4I0RsR749bLqGeOu3tZOVfdIfQ9wS0Q8CRARQyXWA7irt7VXmQGaqkfqyyaNeRXwKkn/JuleSf1TfZCkbZIGJA2cPJnWf8tdva2dqp6FOw9YB2wi65f6GUkvmTyoVY/U2XBXb2unMmfhivRIPQbcFxGjwHckPUwWqPvLKspdva2dqu6RegfZ1gdJK8h26Y6WWBPgrt7WPlX3SP0yMCzpCHAA+M2IGC6rJrN2c49Us2LcI9Ws3RwgswQOkFkCB8gsgQNklsABMkvgAJklcIDMEjhAZgkcILMEDpBZAgfILIEDZJZg2gvqJP0pMO2p2hGxvZSKzGpkpi3QAPAA0AO8Hngkv20AXlh6ZWY1MO0WKCJ2A0h6H7Axv0AOSX8B3NOZ8szmtyLHQOcDyyY8XpI/Z9b1iiwq8gfAg5IOkF2V9zPAzjKLMquLGQMkqQF8G7gkvwH8VkQcL7swszqYMUAR0ZR0S0RcDHypQzWZ1UaRY6D9kt4uyWs/mU1SJEC/CtwOfF/SaUlnJJ0uuS6zWmg5iRARSztRiFkdFVraV9L5ZEvujq/AHhFfK6sos7poGSBJvwLcQLa29SHgUuAg8KZSKzOrgSLHQDcAPwU8FhGXAxcDT5VZlFldFAnQSESMAEh6UUR8C7io3LLM6qHIMdCxvGfPHcBdkp4EHiuzKLO6aLkFioi3RcRTEbET+B3gL4G3FvnwIk2G83FvlxSS+grWbTYvtAyQpEslLQWIiK8Cd5MdB7V6X8smw/m4pWTHWffNqnKzeaDIMdCfA2cnPD6bP9dKkSbDAB8DPg64y6/VTpEAKSY0EYqIJsWOnVo2GZb0euDlEfHPMxbQxibDZu1UJEBHJW2XtCi/3UAb2jDmZ3rvAj7Yamw7mwybtVORAL0XeANZg+BjZJc1bCvwvlZNhpcCPw7cLWmQ7AvavZ5IsDopci7cEFmD4NkabzJMFpxrgOsmfO7TwIqxx5LuBn4jIty/0WqjyCzc7vx7oLHH50u6tdX7CjYZNqu1IpMBr4uIp8YeRMSTklpOY+dj9wH7Jj134zRjNxX5TLP5pMgxUCM/GxsASS+l4FncZgtdkSD8EXBQ0u1ki4pcDdxUalVmNVFkEuGvJD0AXJ4/dVVEHCm3LLN6KLQrlh/8nyS/oE7S6oh4vNTKzGqgyCzcFkmPAN8BvgoMAv9Scl1mtVBkEuFjZF9yPhwRrwSuAO4ttSqzmigSoNGIGCabjWtExAHAZwuYUewY6ClJS8gWlP8bSUPAuXLLMquHIlugLcAzZNfs3Ak8CrylzKLM6mKmBlv/GhEbgRM812hrbHXS35P0PeATEfHJkms0m7dm6g+0Mf855cKKkpYDXwccIOtac+6Rmk8sbGpfKWb1k9RkOCK+265CzOrIXbrNEjhAZgkcILMEDpBZAgfILIEDZJbAATJL4ACZJXCAzBI4QGYJHCCzBA6QWQIHyCyBA2SWoNQAteqRKmmHpCOSHpK0X9IryqxnrprN4OjJsxz8r1McPXmWZjNav8m6QmlrXE/okfpzZH2F7pe0d9Kqpg8CfRHxjKT3AX8IvKOsmuai2QzuPHycHXsOMTLapGdRg11bN9C//gIaDbX+AFvQytwCteyRGhEHIuKZ/OG9ZE245pXB4XPj4QEYGW2yY88hBoe9MJGVG6CWPVInuZ5pVjytskfqidMj4+EZMzLaZOiMeyLbPJlEkPROssUaPzHV61X2SF21rIeeRc//z9SzqMHKpT0drcPmpzID1KpHKgCSrgQ+DGyJiO+XWM+crFm+mF1bN4yHaOwYaM3yxRVXZvNBmY2yZuyRCpB3uvsU0J/3Yp13Gg3Rv/4CXr39MobOjLByaQ9rli/2BIIBJQYoIp6VNNYj9QXArWM9UoGBiNhLtsu2BLhdEsDjETHv+qc2GmJt7xLW9i6puhSbZxRRr+80+vr6YmDAjbyt46bc5ZgXkwhmdeUAmSVwgMwSOEBmCRwgswQOkF
kCB8gsgQNklsABMkvgAJklcIDMEjhAZgkcILMEDpBZAgfILIEDZJbAATJLUOaaCF2r2QwGh89x4vQIq5Z5DYWFzAFqM69k2l28C9dmXsm0uzhAbeaVTLuLA9RmXsm0uzhAbeaVTLuLJxHazCuZdhcHqASzXcnU09715QBVzNPe9eZjoIp52rvequ6R+iJJf5u/fp+kNWXWMx/NZdp7tj1bPX768an9b6vukXo98GRE/Jika4CPM896pJZtbNp7Yohmmvae7S6fx08/vh27z5X2SM0f787v/x1whfI+J91ittPes93l8/jpx7dj97nqHqnjYyLiWeBpYPnkD6qyR2rZxqa9922/jC9uu4R92y+b8V/A2e7yefz049tx1kgtJhGq7JHaCWPT3peuXcHa3iUz7j7M9kwHj59+fDvOGqm6R+r4GEnnAS8GhkusqfZmu8vn8dOPb8dZI6V1qMsD8TBwBVlQ7geui4jDE8a8H/iJiHhvPolwVURsnelz3aHuuS9ei57p4PHTj5/F2KmfLLPFo6TNwB/zXI/Umyb2SJXUA3weuBj4HnBNRByd6TMdIKtI5wNUBgfIKuIeqWbt5gCZJajdLpykk8BjVddR0ArgVNVFdNBC/ntPRUT/5CdrF6A6kTQQEX1V19Ep3fb3gnfhzJI4QGYJHKByfbrqAjqs2/5eHwOZpfAWyCyBA2SWwAEqiaRBSf8h6ZCkBXfukaRbJQ1J+uaE514q6S5Jj+Q/z6+yxk5wgMp1eURsWKDfjXwOmPzF4oeA/RGxDtifP17QHCCbk4j4GtkZ9BNNvER/N/DWTtZUBQeoPAF8RdIDkrZVXUyHrIqI7+b3jwOrqiymE7ywYnk2RsQTklYCd0n6Vv6vdleIiJC04L8j8RaoJBHxRP5zCPhHslWKFroTkn4UIP85VHE9pXOASiBpsaSlY/eBnwe+OfO7FoS9wLvy++8CvlRhLR3hMxFKIGkt2VYHst3k2yLipgpLajtJXwA2kV3CcAL4XeAOYA+wmuySk60RMXmiYUFxgMwSeBfOLIEDZJbAATJL4ACZJXCAzBI4QDUmac3Es6FbjP162fV0IweoS0TEG6quYSFygBYISWslPSjpZyV9I78O6SFJ6/LXz+Y/P5q/dkjSE5I+mz//zgnv+1TeYdBacIAWAEkXAX8PvBu4GviTiNgA9JE1NhsXETfmr20iuxzhzyS9hqy15hvz1/4P+KXOVF9vPhu7/nrJzjm7KiKOSDoIfFjShcA/RMQjk9+Qt9H8a2BXRDwg6QPAT5L1sQX4EbrgRNB28Bao/p4GHgc2AkTEbcAW4H+BfZLeNMV7dgLHIuKz+WMBu/OrZzdExEURsbP0yhcAB6j+fgC8DfhlSdflJ7IejYibybZMr5s4WNIvAFcC2yc8vR+4Or92aWxtg1d0pPqa8y7cAhAR5yS9BbgL+CfgHZJGya4K/f1Jw3eQNXf+Rr67tjcibpT0EbIraBvAKPB+6rOIf2V8NrZZAu/CmSVwgMwSOEBmCRwgswQOkFkCB8gsgQNkluD/AeDcPj5PCDaEAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(data=jaccard_df, x=\"ksize\", y=\"jaccard\", height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e3d860cc-45c8-4b94-9d00-fc15f874276b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:32.809965Z", + "iopub.status.busy": "2024-11-12T21:31:32.809855Z", + "iopub.status.idle": "2024-11-12T21:31:32.888595Z", + "shell.execute_reply": "2024-11-12T21:31:32.888321Z", + "shell.execute_reply.started": "2024-11-12T21:31:32.809955Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADRCAYAAABSOlfvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAT20lEQVR4nO3dfZRcdX3H8fdncHVxkwh5piFhCSfVhqMGWBWVeEKtdk3VoGiEHkU4HGN7eDyISoVaWquHUkzbeFrlsYAnPqQoEBUjEFHAIrCGhbBBDYRVSfO4WvJAl2yYb/+4v0kmy8zu/Hbm7syd+b7OuWfu/c29M99J5ru/e+/8HmRmOOfGLlfvAJzLOk8i56rkSeRclTyJnKuSJ5FzVcp8EnV3dxvgiy/jsZSU+STauXNnvUNwLS7zSeRcvXkSOVclTyLnqvSKegeQhnze6B/Yy7Zdg8yY1E7nlA5yOdVtf9fcmi6J8nljTd9WLlnVy+BQnva2HMuXLqD7+Jklv+jjsb8naHNT1hugdnV1WU9Pz4HtTTv2sHjFAwwO5Q+UtbfluOvChcydNuFlx6e5f9oJWjjGk27clPyHbbprom27Bg/5ggMMDuXZvntw3PfvH9h7ICEK+12yqpf+gb0lXzt2/0LSLV7xAGde/zCLVzzAmr6t5POl/zDm88amHXt46JmdbNqxp+x+Lk7TJdGMSe20tx36sdrbckyf2D7u+6ed0DFJF5twrnJNl0SdUzpYvnTBgS964ZSoc0rHuO+fdkKnWSuC11yVarobC7mc6D5+Jq+7cCHbdw8yfeLI1wlp7l9IuOHXOKMlaKX7F5Ju+PVZbK1Y6tpvLNdnrarpbiw0msKFfyUJGrt/zBc97RsuLaLkf0TT1USNJpcTc6dNqPiLF7N/mrVibM0FrXun0JMo4ypNutjT1phTRWjt07+mu7Hgyisk3MlzpzJ32oQRv9yxN1zGcuOiWXhN5EqKrbnGcvrXLDyJXFkx12exp3/NxE/nXE3Env41E6+JXE3Env41E08iVzOxt/ObhSeRq4tm+k3Jk8iNu2b7TclvLLhx12y/KaWaRJJmS7pP0gZJfZIuCuWTJd0jaWN4PDKUS9IKSU9LekLSiWnG5+ojtstHo0u7JtoPfMrM5gMnA+dJmg9cBqw1s3nA2rAN8B5gXliWAV9NOT5XB7FdPhpdqklkZlvMbF1Y3w0
8BcwClgC3hN1uAU4L60uAWy3xc+AISUelGaMbf832m9KYbixIygETzGxXxDGdwAnAw8AMM9sSntoKzAjrs4DfFR32XCjbUlSGpGUkNRVz5swZwydw9dRsvylVXBNJ+oakSZI6gCeBDZI+XeGxE4DvABcPTzxLOjRFdWoys+vMrMvMuqZNmxZzqGsQMY1hG13M6dz8kACnAT8EjgU+NtpBktpIEmilmX03FG8rnKaFx+2hfDMwu+jwo0OZcw0rJonaQkKcBqw2syFGqUEkCbgReMrMlhc9tRr4eFj/OHBnUflZ4S7dycDzRad9zjWkmGuirwH9wOPA/ZKOAUa7Jno7SW21XlJvKPsccBWwStK5wG+ApeG5u4DFwNPAC8A5EfE5VxcVJVG4kbDNzGYVlf0WOHWk48zsQcr0SwfeWWJ/A86rJCbnGkVFp3Nmlgc+M6zMzGx/KlE5lyEx10T3Sro0tEKYXFhSi8y5jIi5JvpIeCw+3TJgbu3CcS57Kk4iMzs2zUCcy6qYH1tfLekKSdeF7XmS3pteaM5lQ8w10X8C+4C3he3NwD/WPCLnMiYmiY4zs6uBIQAze4Hyt6+daxkxSbRP0uGEVgqSjgNeTCUq5zIk5u7c3wFrgNmSVpK0Rjg7jaCcy5KYu3P3SFpH0rlOwEVmtjO1yJzLiNhOebOAw4BXAu+Q9MHah+RctlRcE0m6CXgD0AcUOsgb8N2yBznXAmKuiU4OYyU4N+4aeZy6mCR6SNJ8M9uQWjTOldDo49TFXBPdSpJIvwrDWa2X9ERagTlX0Ojj1MXURDcSOthx8JrIudQ1+txHMUm0w8xWpxaJc2U0+txHMadzj4URf86U9MHCklpkzgWNPk5dTE10OEkzn3cXlfktbpe6Rh+nLiaJPmVmvy8ukOR9jNy4aOS5j2JO574naVJhQ9KfAN+rfUjOZUtMEn2JJJEmSDoJuA34aDphOZcdMQ1QfxAGb7wbmAh8wMx+nVpkzmXEqEkk6SscOtLpa4BngPMlYWYXphWcc1lQSU3UM2z7F2kE4lxWjZpEZnbLaPsASPqOmZ1efUjOZUstJ/ny8edcS6plEkXNMeRcs/DZw52rUi2T6GVtMCTdJGm7pCeLyq6UtFlSb1gWFz33N2Hm8F9J+vMaxuZcaqKSSNLhkl5b5unPlii7GeguUf4vZrYgLHeF154PnAEcH475D0mHxcTnXD3EDCP8PqCXZNgsJC2QdKBrhJndPfwYM7sf+P3w8jKWAN8ysxfN7FmSib7eXGl8ztVLTE10JcmX+n8BzKyXZN7WsTg/9I69SdKRoazczOEvI2mZpB5JPTt27BhjCM7VRkwSDZnZ88PKxnJH7qvAccACYAvw5dgX8NnDXSOJSaI+SX8JHBZmhPgK8N+xb2hm28zspTD73vUcPGXzmcNdJsUk0QUkF/0vAt8kmfT44tg3lHRU0eYHgMKdu9XAGZJeFfopzQMeiX1958ZbTCvuF4DLw1IRSd8EFgFTJT1HMp73IkkLSE4F+4FPhtfvk7QK2ADsB84zs5cqfS/n6kXJhN0V7CjdR4lrIDP701oHFaOrq8t6eoa3kXUuFSX7o8d0D7+0aL0dOJ2kxnCupcWczg3vAvEzSX7N4lpezID2k4s2c8BJJB30nGtpMadzvyC5JhLJadyzwLlpBOVclsSczvnwWM6VEHM6N+Jop2bmgzi6lhRzOncu8Dbgx2H7VJIWCzvwkVBdC4tJojZgvpltgQMtD242s3NSicy5jIhp9jO7kEDBNmBOjeNxLnNiaqK1kn5E0m4O4CPAvbUPyblsibk7d364ubAwFF1nZrenE5Zz2RFTExXuwPkNBOeKVDKM8INmdoqk3RzaAFWAmdmkMoc61xIqGQH1lPA4Mf1wnMueqNO5MPrOjOLjzOy3tQ7KuSyJabFwAUmnum0cnD3cgDekEJdzmRFTE10EvNbMBtIKxrksivmx9XfA8NF+nGt5MTXRJuAnkn5AMlgJAGa2vOZ
ROZchMUn027C8MizOOeJaLPx9moE4l1Uxd+cacrQf5+rNR/txrko+2o9zVfLRfpyrko/241yVajbaj6R3mdk91YfkXLbUcs7Wf6rhazmXGfWY+HiypHskbQyPR4ZySVoRJj5+QtKJNYzNtZh83ti0Yw8PPbOTTTv2kM+PZT66ytQyiUpFeTMvn/j4MmCtmc0D1oZtgPeQzEk0D1hGMqOec9HyeWNN31YWr3iAM69/mMUrHmBN39bUEqmWSfQyZSY+XgLcEtZvAU4rKr/VEj8Hjhg2IZhzFekf2Mslq3oZHEp67AwO5blkVS/9A3tTeb9aJlF/hfvNKBp6aytJJz/wiY9djWzbNXgggQoGh/Js3z2YyvvF9mx9G9DJoT1bbw2PIw4zXIqZmaToOtbMrgOug2SSr9jjXXObMamd9rbcIYnU3pZj+sT2VN6v4ppI0teBa4BTgDeFpWsM77mtcJoWHreHcp/42NVE55QOli9dQHtb8vVub8uxfOkCOqd0pPJ+MTVRF8kwwtX+5V8NfBy4KjzeWVR+vqRvAW8Bnh824qpzFcnlRPfxM3ndhQvZvnuQ6RPb6ZzSQS5XcrbIqsUk0ZPATKDiL3aZiY+vAlZJOhf4DbA07H4XsBh4GngB8DG+3ZjlcmLutAnMnTYh9feKSaKpwIbQ6LS4Z+v7yx1gZmeWeeqdJfY14LyIeJxrCDFJdGVaQTiXZTFt53460vOSHjKzt1YfknPZUsvfidK5f+hcg0u72Y9zTS/VZj/OtYJUW3E71wpiWixcUOi2UMbHahCPc5kTUxPNAB6VtEpSt6RDah4ze7LMcc41tYqTyMyuIOnrcyNwNrBR0pckHZdSbM5lQtQ1UWhVsDUs+4EjgdskXZ1CbM5lQsyQWRcBZwE7gRuAT5vZkKQcsBH4TDohOtfYYpr9HAl80Mx+U1xoZnlJ761tWM5lR0Wnc2GayTOGJ1CBmT1V06icy5CKksjMXgJ+JWlOyvE4lzmxp3N9oSvEgREfRuoK4VwriEmiv00tCucyLKorhKRjgHlmdq+kVwOHpReac9kQ0+znE8BtwLWhaBZwRwoxOZcpMT+2nge8HdgFYGYbgelpBOVclsQk0Ytmtq+wIekVeB8i56KS6KeSPgccLuldwH8B30snLOeyIyaJLgN2AOuBT5IMcXVFGkE5lyUxd+fywPVhcc4FMQ1Qn6XENZCZza1pRM5lTOwwwgXtwIeByWX2da5lxHTKGyhaNpvZvwJ/kV5ozmVDzOlc8fSPOZKaKWpqFueaUUwSfJmD10T7SSb1+nCtA3Iua2KS6PskSVQYoMSA9xbGKzGz5bUNzblsiEmik0gm9rqTJJHeBzxC0jU8mqR+YDfwErDfzLokTQa+TTIbXz+w1Mz+MJbXd268xCTR0cCJZrYbQNKVwA/M7KNVvP+pZrazaLsws/hVki4L25+t4vWdS13suHP7irb3cXDS4lopN7O4cw0rpia6FXhE0u1h+zTg5ire24C7w8TH14bJjMvNLH4IScuAZQBz5niPdVdfipmCNdzmXhg27zezx8b8xtIsM9ssaTpwD3ABsNrMjija5w9mNtLQxXR1dVlPT89Yw3AuRsnx5qN+5zGzdcC6WkRjZpvD4/ZQu72ZMLO4mW0ZNrO4cw2rLlOrSOqQNLGwDrybZGLlwszicOjM4s41rHq1OJgB3B5+Y3oF8A0zWyPpUUrPLO5cw6pLEpnZJuCNJcoHKDGzuHONzGfKc65KnkTOVcmTyLkqeRI5VyVPIueq5EnkXJU8iZyrknfvdi0vnzf6B/aybdcgMya10zmlg1yuZDO5kjyJXEvL5401fVu5ZFUvg0N52ttyLF+6gO7jZ1acSH4651pa/8DeAwkEMDiU55JVvfQP7B3lyIM8iVxL27Zr8EACFQwO5dm+e7Di1/Akci1txqR22tsOTYP2thzTJ7ZX/BqeRK6ldU7pYPnSBQcSqXBN1Dmlo+LX8BsLrqXlcqL7+Jm87sKFbN8
9yPSJfnfOuWi5nJg7bQJzp00Y2/E1jse5luNJ5FyVPImcq1LUkFmNSNIOkvEYms1UYOeoe2Vflj7nTjPrHl6Y+SRqVpJ6zKxr9D2zrRk+p5/OOVclTyLnquRJ1Liuq3cA4yTzn9OviZyrktdEzlXJk8i5KnkS1YGk2ZLuk7RBUp+ki0L5FyQ9IalX0t2S/iiUS9IKSU+H508c+R0axxg+6yJJz4fyXkmfr+8nqICZ+TLOC3AUydSdABOBXwPzgUlF+1wIfC2sLwZ+SDI/zsnAw/X+DCl+1kXA9+sdd8ziNVEdmNkWS+Z6wpI5cJ8CZpnZrqLdOkhmE4RkGs5bLfFz4Igwf1PDG8NnzRzvClFnkjqBE4CHw/YXgbOA54FTw26zgN8VHfZcKNtChlT4WQHeKulx4H+AS82sb5xDjeI1UR1JmgB8B7i48JfZzC43s9nASuD8esZXSxGfdR1wjJm9EfgKcEcdwo3iSVQnktpIvlQrzey7JXZZCZwe1jcDs4ueOzqUZULMZzWzXWa2J6zfBbRJmjpuwY6BJ1EdKJki8EbgKTNbXlQ+r2i3JcAvw/pq4Kxwl+5k4Hk7OMt6Q4v9rJJmhmOQ9GaS7+jA+EUcz6+J6uPtwMeA9ZJ6Q9nngHMlvRbIk3Tv+Kvw3F0kd+ieBl4AzhnXaKsT+1k/BPy1pP3A/wFnWLht16i82Y9zVfLTOeeq5EnkXJU8iZyrkieRc1XyJHKuSp5EzlXJk8i9jKT+Rm8l0Eg8iTJIUqZ/JM96/MN5Eo0DSZdL+rWkByV9U9Klkn4iqSs8P1VSf1g/TNI/S3o0dFr7ZChfJOkBSauBDZL+QdLFRe/xxUKHtxLvvyi8322SfilpZaFpzShxHy7ph5I+IakzHHtz+CwrJf2ZpJ9J2hia6CCpQ9JNkh6R9JikJaH8bEmrJf0YWCvpKEn3h453T0paWN2/ch3Vu0NTsy/AScB64NXAJJKmO5cCPwG6wj5Tgf6wvgy4Iqy/CugBjiXprLYXODY81wmsC+s54BlgSpkYFpF0Nzg67PsQcMoIMfeH178XOKvo/fYDrw+v8QvgJpKOgkuAO8J+XwI+GtaPIOmE1wGcTdKFY3J47lPA5WH9MGBivf+vxro0VbXaoBYCt5vZCwChJhnJu4E3SPpQ2H4NMA/YBzxiZs8CmFm/pAFJJwAzgMfMbKSGmo+Y2XMhhl6SpHhwhP3vBK42s5VFZc+a2frwGn3AWjMzSevD6xXif7+kS8N2OzAnrN9jZr8P648CN4UW3neYWe8IsTQ0P52rn/0c/PcvnttQwAVmtiAsx5rZ3eG54bPx3kDyF/4cklphJC8Wrb/E6I2PfwZ0DzvtK36NfNF2vuj1BJxeFP8cM3tqePxmdj/wDpIuHTdLOmuUeBqWJ1H67gdOC9cXE4H3hfJ+klM9SFouF/yIpBVzG4CkP5ZUbu7D24Fu4E3huFr6PPAH4N8jj/sRcEFRd4YTSu0k6Rhgm5ldT/LHIDODrwznSZQyS8YX+DbwOMlgI4+Gp64hSZbHSK6JCm4ANgDrJD0JXEuZWsPM9gH3AavM7KUUwr8IOFzS1RHHfAFoA54Ip3xfKLPfIuDx8Pk/AvxbNYHWk3eFGGeSrgT2mNk1NXitHEl36g+b2cZqX8+NjddEGSVpPsmdvrWeQPXlNVETkfR64OvDil80s7eU2f92ktvnxT5rZrW+vmpqnkTOVclP55yrkieRc1XyJHKuSp5EzlXp/wHBQ+FyS9L9xQAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(data=jaccard_df, x=\"query_n_kmers\", y=\"query_n_unique_kmers\", height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8a341b04-2470-443f-8758-9df1783e79cf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:32.889054Z", + "iopub.status.busy": "2024-11-12T21:31:32.888946Z", + "iopub.status.idle": "2024-11-12T21:31:32.965630Z", + "shell.execute_reply": "2024-11-12T21:31:32.965373Z", + "shell.execute_reply.started": "2024-11-12T21:31:32.889044Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADQCAYAAACZZoRKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAARjUlEQVR4nO3dfbBcdX3H8fdn8bYX81BJSIITiTFMBoUZm8qtpQgtaLUxRUHUINbHYYp/8OTEJ6pUaad1qFPTNnZqhYpA60NTH0OlqTG1ghXFgCkQrI3GK5IhNw9aEkIvXNhv/zhnw2bZm9zfPXvu7tn9vGZ2ds/Zc85+b2a/Ob9z9vf7fRURmNn01bodgFnVOYnMCnISmRXkJDIryElkVlDlk2jlypUB+OHHTDzaqnwS7d27t9sh2ICrfBKZdZuTyKwgJ5FZQc/odgBmU1GvB6P7DjK2f5xFc4dZOn8WtZo6sn3qsVs5iaxjyvri1uvBxm27WLN+K+MTdYaHaqxdvYKVp57Qdp+U7VOP3Y6q3gF1ZGQktmzZ0u0w+lJZX/TUL+6OPY+wat3tjE/UD60bHqpx6xVnsWzB7ELbJx677R/va6IBUq8HO/Y8wh0/3suOPY9Qr0/+H2jji75q3e1cdP13WbXudjZu2zXpPqP7Dh5KCoDxiTpr1m9ldN/BQtsCjO0fP+xL3thn94HxwtunHrsdJ9GAKDMpoNwv7qK5wwwPHf5VHR6qsXDOcOHtU4/djpOo4qZ6din7f/8yv7hL589i7eoVh/ZpNP+Wzp9VePvUY7fjGwsVlnJtcaSkaHdd0fiit14rHO2L3hrLkb64U9kWoFYTK089gedfcRa7D4yzcM6Rr89Stk89dju+sVBhJV5AT+uuVeNGxFS+jCnb9pC2AfpM1GNS7oilnF3K/t+/sc+yBbPbJmWRbXudk6iHpP7vn9LkKjspBplvLPSQ1Iv/1IviRlKcvux4li2YXYXmUyX4TNRDUi/+O3FRbMU5iXpI6h0xcJOrF7g510M68ZuFzTyfiXqIm2fV5CTqMW6eVY+TqGRFx6pY73MSlagTY1Ws9/nGQolSf/exaio1iSSdKOkbku6XtE3Slfn6eZI2SdqePx+Xr5ekdZJ+JOkeSS8qM76ydWKsivW+ss9ETwDviohTgNOBSyWdAlwFbI6I5cDmfBnglc
Dy/HEJ8PGS4ytVJ8aqWO8rNYki4qGIuDt/fQD4AbAYOA+4Kd/sJuD8/PV5wM2R+Q7wLEnPLjPGMvl3n8EwrRsLkmrA7IjYn7DPUuDXgO8CiyLiofytXcCi/PVi4GdNuz2Yr3uoaR2SLiE7U7FkyZJp/AUzw7/7DIYpn4kkfUbSXEmzgPuA+yW9Z4r7zga+ALyzNfEiG9CUNKgpIq6LiJGIGFmwYEHKrjPOnT77X0pz7pQ8Ac4H/hV4HvDmo+0kaYgsgT4dEV/MV481mmn58+58/U7gxKbdn5OvM+tZKUk0lCfE+cCGiJjgKGcQSQI+CfwgItY2vbUBeGv++q3AV5rWvyW/S3c68HBTs8+sJ6VcE/0dMAr8F3CbpOcCR7smegnZ2epeSVvzde8HrgXWS7oY+CmwOn/vVmAV8CPgUeDtCfGZdcWUkii/kTAWEYub1j0AnHOk/SLiW0wyLh14WZvtA7h0KjGZ9YopNeciog68t2VdRMQTpURlViEp10Rfl/TuvBfCvMajtMjMKiLlmujC/Lm5uRXAss6FY1Y9U06iiHhemYGYVVXKj63PlHS1pOvy5eWSzi0vNLNqSLkm+hTwOHBGvrwT+NOOR2RWMSlJdFJEfASYAIiIR5n89rXZwEhJosclHUveS0HSScBjpURlViEpd+c+BGwETpT0abLeCG8rIyizKkm5O7dJ0t1kg+sEXBkRe0uLzKwiUgflLQaOAX4J+C1JF3Q+JLNqmfKZSNINwAuBbUBj4oAAvjjpTmYDIOWa6PR8roSB57nkrFlKEt0h6ZSIuL+0aCrAc8lZq5RropvJEumH+XRW90q6p6zAepXnkrNWKWeiT5IPsOOpa6KBk1pDyPpfShLtiYgNpUVSEdOpIWT9LaU59/18xp+LJF3QeJQWWY/yXHLWKuVMdCxZN59XNK0buFvcnkvOWqUk0bsi4ufNKyQN5Bgj1xCyZinNuVskzW0sSHoBcEvnQzKrlpQk+jBZIs2WdBrweeBN5YRlVh0pHVC/mk/e+DVgDvCaiPif0iIzq4ijJpGkj3H4TKe/AvwYuEwSEXFFWcGZVcFUzkRbWpbvKiMQs6o6ahJFxE1H2wZA0hci4rXFQzKrlk4W+fL8czaQOplESTWGzPqFq4ebFdTJJHpavxdJN0jaLem+pnXXSNopaWv+WNX03h/mlcN/KOl3OxibWWmSkkjSsZJOnuTt97VZdyOwss36v4yIFfnj1vzYpwBvAE7N9/lbScekxGfWDSnTCL8K2Eo2bRaSVkg6NDQiIr7Wuk9E3Ab8vHX9JM4DPhcRj0XET8gKfb14qvGZdUvKmegasi/1/wJExFayuq3TcVk+OvYGScfl6yarHP40ki6RtEXSlj179kwzBLPOSEmiiYh4uGXddO7IfRw4CVgBPAR8NPUAVaoebv0vJYm2SXojcExeEeJjwLdTPzAixiLiybz63vU81WRz5XCrpJQkupzsov8x4LNkRY/fmfqBkp7dtPgaoHHnbgPwBkm/nI9TWg7cmXp8s5mW0ov7UeAD+WNKJH0WOBs4XtKDZPN5ny1pBVlTcBR4R378bZLWA/cDTwCXRsSTU/0ss25RVrB7ChtK36DNNVBEvLTTQaUYGRmJLVta+8ialaLtHAApw8Pf3fR6GHgt2RnDbKClNOdah0D8pyRfs9jAS5nQfl7TYg04jWyAntlAS2nO3UV2TSSyZtxPgIvLCMqsSlKacwM5PZbZ0aQ0544422lEDNQkjmYNKc25i4EzgH/Pl88h67GwhwGcCdWsISWJhoBTIuIhONTz4MaIeHspkZlVREq3nxMbCZQbA5Z0OB6zykk5E22W9G9k/eYALgS+3vmQzKol5e7cZfnNhbPyVddFxJfKCcusOlLORI07cL6BYNZkKtMIfysizpR0gMM7oAqIiJg7ya5mA2EqM6CemT/PKT8cs+pJas7ls+8sat4vIh7odFBmVZLSY+FyskF1YzxVPTyAF5YQl1llpJyJrgROjo
h9ZQVjVkUpP7b+DGid7cds4KWciXYA/yHpq2STlQAQEWs7HpVZhaQk0QP545fyh5mR1mPhj8sMxKyqUu7O9eRsP2bd5tl+zArybD9mBXm2H7OCPNuPWUEdm+1H0ssjYlPxkMyqpZM1W/+8g8cyq4xuFD6eJ2mTpO3583H5eklalxc+vkfSizoY2xHV68GOPY9wx4/3smPPI9Tr06ldZoOqk0nU7pt3I08vfHwVsDkilgOb82WAV5LVJFoOXEJWUa909XqwcdsuVq27nYuu/y6r1t3Oxm27nEg2ZZ1MoqeZpPDxecBN+eubgPOb1t8cme8Az2opCFaK0X0HWbN+K+MT2eiO8Yk6a9ZvZXTfwbI/2vpEJ5NodIrbLWqaemsX2SA/6FLh47H944cSqGF8os7uA+OFjmuDI3Vk6xnAUg4f2Xpz/nzEaYbbiYiQlNxuiojrgOsgK/KVun+zRXOHGR6qHZZIw0M1Fs4ZLnJYGyBTPhNJ+gfgL4AzgV/PHyPT+MyxRjMtf96dr+9K4eOl82exdvUKhoeyf4rhoRprV69g6fxZZX+09YmUM9EI2TTCRa+4NwBvBa7Nn7/StP4ySZ8DfgN4uGXG1VLUamLlqSfw/CvOYveBcRbOGWbp/FnUam0rC5o9TUoS3QecAEz5iz1J4eNrgfWSLgZ+CqzON78VWAX8CHgUmLE5vms1sWzBbJYtmD1TH2l9JCWJjgfuzzudNo9sffVkO0TERZO89bI22wZwaUI8Zj0hJYmuKSsIsypL6Tv3zSO9L+mOiPjN4iGZVUsnfyfyPWEbSGV3+zHre6V2+zEbBKX24jYbBCk9Fi5vDFuYxJs7EI9Z5aSciRYB35O0XtJKSYedeSLivkn2M+trU06iiLiabKzPJ4G3AdslfVjSSSXFZlYJSddEea+CXfnjCeA44POSPlJCbGaVkDJl1pXAW4C9wN8D74mICUk1YDvw3nJCNOttKd1+jgMuiIifNq+MiLqkczsblll1TKk5l5eZfENrAjVExA86GpVZhUwpiSLiSeCHkpaUHI9Z5aQ257blQyEOzeJxpKEQZoMgJYn+qLQozCosaSiEpOcCyyPi65KeCRxTXmhm1ZDS7ecPgM8Dn8hXLQa+XEJMZpWS8mPrpcBLgP0AEbEdWFhGUGZVkpJEj0XE440FSc/AY4jMkpLom5LeDxwr6eXAPwO3lBOWWXWkJNFVwB7gXuAdZFNcXV1GUGZVknJ3rg5cnz/MLJfSAfUntLkGiohlHY3IrGJSpxFuGAZeD8ybZFuzgZEyKG9f02NnRPwV8HvlhWZWDSnNuebyjzWyM1NSaRazfpSSBB/lqWuiJ8iKer2+0wGZVU1KEv0LWRI1JigJ4NzGfCURsbazoZlVQ0oSnUZW2OsrZIn0KuBOsqHhySSNAgeAJ4EnImJE0jzgn8iq8Y0CqyPiF9M5vtlMSUmi5wAviogDAJKuAb4aEW8q8PnnRMTepuVGZfFrJV2VL7+vwPHNSpc679zjTcuP81TR4k6ZrLK4Wc9KORPdDNwp6Uv58vnAjQU+O4Cv5YWPP5EXM56ssvhhJF0CXAKwZIlHrFt3KaUEa36b+6x88baI+P60P1haHBE7JS0ENgGXAxsi4llN2/wiIo40dTEjIyOxZcuW6YZhlqLtfPNJv/NExN3A3Z2IJiJ25s+787Pbi8kri0fEQy2Vxc16VldKq0iaJWlO4zXwCrLCyo3K4nB4ZXGzntWtHgeLgC/lvzE9A/hMRGyU9D3aVxY361ldSaKI2AH8apv1+2hTWdysl7lSnllBTiKzgpxEZgU5icwKchKZFeQkMivISWRWUF8O767Xg9F9BxnbP86iucMsnT+LWq1ttyezwvouier1YOO2XaxZv5XxiTrDQzXWrl7BylNPcCJZKfquOTe67+ChBAIYn6izZv1WRvcdPMqeZtPTd0k0tn/8UAI1jE/U2X1gvEsRWb/ruyRaNHeY4aHD/6zhoRoL5wx3KSLrd32XRE
vnz2Lt6hWHEqlxTbR0/qwuR2b9qu9uLNRqYuWpJ/D8K85i94FxFs7x3TkrV98lEWSJtGzBbJYtmN3tUGwA9F1zzmymOYnMCnISmRWUNGVWL5K0h2w+hl53PLD3qFv1h379W/dGxMrWlZVPoqqQtCUiRo6+ZfUN0t8Kbs6ZFeYkMivISTRzrut2ADNokP5WXxOZFeUzkVlBTiKzgpxEM0DSqKR7JW2V1Fd1YCTdIGm3pPua1s2TtEnS9vz5iOVxqs5JNHPOiYgVffj7yY1A6w+QjbKhy4HN+XLfchJZIRFxG/DzltUDVTbUSTQzGqU178pLZfa7KZUN7Rd9OZ6oB53ZXFpT0n/n/4P3vYiIvC5v3/KZaAY0l9YEGqU1+9lYXi6UQSgb6iQq2RFKa/azgSob6h4LJZO0jOzsA0+V1vyzLobUUZI+C5xNNvxhDPgQ8GVgPbCEvGxoRLTefOgbTiKzgtycMyvISWRWkJPIrCAnkVlBTiKzgpxEfULS0uae1EfZ9ttlxzNInEQDKCLO6HYM/cRJ1IckLZP0fUm/LenOfBzTPZKW5+8/kj//Sf7eVkk7JX0qX/+mpv0+IemYbv49vc5J1GcknQx8AXgb8DrgryNiBTACPNi8bUR8MH/vbLLhDH8j6QXAhcBL8veeBH5/ZqKvJvfi7i8LyPqpXRAR90u6A/iApOcAX4yI7a07SBLwj8DaiLhL0mXAacD3src4lj7vQFqUz0T95WHgAeBMgIj4DPBq4P+AWyW9tM0+1wAPRsSn8mUBN+WjcFdExMkRcU3pkVeYk6i/PA68BniLpDfmnV93RMQ6sjPUC5s3lvQq4HeAK5pWbwZel499asyX8NwZib6i3JzrMxFxUNK5wCbgFuBCSRNkI0w/3LL5GmAxcGfedNsQER+UdDXZSNwaMAFcSjWKBnSFe3GbFeTmnFlBTiKzgpxEZgU5icwKchKZFeQkMivISWRW0P8DFSEH9VsdLG8AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(data=jaccard_df, x=\"ksize\", y=\"query_n_unique_kmers\", height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5e8a9002-2073-41f7-a7d2-a4ca316a01ce", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:32.966566Z", + "iopub.status.busy": "2024-11-12T21:31:32.966453Z", + "iopub.status.idle": "2024-11-12T21:31:33.108691Z", + "shell.execute_reply": "2024-11-12T21:31:33.108401Z", + "shell.execute_reply.started": "2024-11-12T21:31:32.966556Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPwAAADRCAYAAADomd+PAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAapUlEQVR4nO3de5hU1ZX38e+vu7lIy51GlIugIgmKAWzRRCYhShzUBMJo4iV5oxMzzuSJ0Ywmo8Ykw+DknfiYODET48QxRk28RB0xTOJ4I/iaiwotIAqIEkQuIjQICA100816/zi7TdHd1V1dVaeqq8/6PE89XXUu+yxKV51Tu87aW2aGcy4ZyoodgHOucDzhnUsQT3jnEsQT3rkE8YR3LkFKLuFnzJhhgD/8UYhHt1NyCb9t27Zih+BcySq5hHfOZc8T3rkE6TYJ33SwkaamxmKH4VyXVlHsAHJ10A6ydc/brHrnJRqa9jFu6CSO6nc0PSt6Fzs057qckj/Db6/bwsI3HmXz7nVs37uFP617gk3vrSt2WM51SSWf8O/sXo+1+AVl1ZaXONDUUKSInOu6Yk14STMkrZa0RtJ1bawfJWmhpKWSlks6p7PHqFCPNpcJZRm1c91XbAkvqRy4DTgbGA9cJGl8i82+BTxkZpOAC4GfdPY4w/qNpKLs0KQ/8cgpVJS3/iBwLuni7LSbAqwxs7UAkh4EZgErU7YxoF943h94u7MHGdiniunHn8/bu96kvqmeEf2PYXDlsBxDd657ijPhhwMbUl5vBE5tsc0c4ClJXwUqgeltNSTpcuBygFGjRrVaP6jPUAb1GZp7xM51c8XutLsIuNvMRgDnAL+Q1ComM7vDzKrNrLqqqqrgQTrXXcSZ8JuAkSmvR4RlqS4DHgIws+eB3sCQGGNyLtHiTPjFwFhJYyT1JOqUm99im/XAmQCSPkiU8LUxxuRcosWW8GbWCFwBPAmsIuqNXyFprqSZYbNrgL+T9DLwAHCp+aiazsVGpZZf1dXVVlNTU+wwXDJ0u5s5it1p55wrIE945xLEE965BPGEdy5BPOGdSxBPeOcSxBPeuQTxhHcuQTzhnUsQT3jnEsQT3rkE8YR3LkE84Z1LEE945xLEE965BPGEdy5BPOGdSxBPeOcSxBPeuQTxhHcuQTzhnUsQT3jnEsQT3rkE8YR3LkE84Z3L
I0mjJb2a4bZ/ijueljzhnSsSM/tIoY/pCd/FNDQ2sGX3Fta9u44de3cUOxyXA0nHSFoq6WOSFklaJmm5pLFh/Z7wd25Yt0zSJkk/D8s/n7LfTyWV5xpTrAkvaYak1ZLWSLouzTaflbRS0gpJ98cZT1dX31jPoo013L/sV8xbMZ9fLn2A9Ts3FDsslwVJ44D/Bi4FzgduNbOJQDWwMXVbM/tOWDcNeBf4cZhN+QLg9LCuCfhcrnHFlvDh0+g24GxgPHCRpPEtthkLXE/0jzoB+Fpc8ZSCbXXbWLzhLxNlNh5s5KnXn6GuYW8Ro3JZqAJ+DXzOzF4Gnge+Kela4Ggz29dyB0kCfgncYmYvEU2jfjKwWNKy8PqYXAOL8ww/BVhjZmvNrAF4EJjVYpu/A24zsx0AZrY1xni6vD0Nda2W7a7fzf7GVv9/uK5tF7AemApgZvcDM4F9wOOSzmhjnznARjP7eXgt4B4zmxge48xsTq6BxZnww4HU69GNYVmq44HjJf1R0guSZsQYT5fXv3e/VsuqKodQ2aOyCNG4HDQAs4EvSLpY0jHAWjP7EdGZ/6TUjSV9CpgOXJmyeAFwvqShYZtBko7ONbBid9pVAGOJvrtcBPyXpAEtN5J0uaQaSTW1tbWFjbCABvcZzPTjzqCirAKAfr36Mn3smfTu0bvIkbnOMrM64JPAPwKfB14Nl+YnAve22PxqopNhcwfdXDNbCXwLeErScuBp4Mhc45KZ5dpG2w1LHwbmmNlfh9fXA5jZv6Vs85/Ai82XMZIWANeZ2eJ07VZXV1tNTU261SXPzNi5byf7G+vp17svlT397F5EKnYA+RbnGX4xMFbSGEk9gQuB+S22eYzo7I6kIUSX+GtjjKnLk8TAPgM5st8wT3aXd7ElvJk1AlcATwKrgIfMbEX4zXFm2OxJYLuklcBC4Btmtj2umJxLutgu6ePS3S/pXZfil/TOudLlCe9cgnjCO5cgnvDOdWGSRkpamFJvclUu7VXkKzDnku7+JbeWATOAycAS4ImLJ191MMdmG4FrzGyJpL7AS5KeDjfmdJqf4Z3Lg5Ds84DfAjeGv/PC8qyZ2WYzWxKe7yb6ibvlLeoZ84R3Lj9mEBXIpJoZlueFpNHAJODFbNvwhHcuPyanWT4pH41LOpyovv5rZvZetu14wjuXH0vSLF+aa8OSehAl+31m9mgubXnCO5cfT9C6VmR+WJ61MDDGz4BVZnZLLm2BJ7xzeRF642cD5xKVtZ4LzM5DL/3pwP8BzkgZ9+6cbBvze+mdS8/vpXfOla60N95I+g8g7enfzK5Mt8451zW1d4avAV4CehP95PBGeEwEesYemXMu79Ke4c3sHgBJXwamhgEtmoel+n1hwnPO5VMm3+EHAqnDqR4eljnnSkwmxTPfA5ZKWkjUa/lRojG0nXMlpt2El1QGrAZODQ+Aa83snbgDc85FwixONcAmM/tkLm21m/BmdlDSbWY2iWgAfedcGtc+8aNW5bE3zbgy1xtvAK4iqpJrPVNJJ2XyHX6BpPPCLX7OuTaEZG9VHhuWZ03SCKK79u7MOUgyS/i/Bx4G6iW9J2m3pKyrdZzrpuIqj/0h8E9APq4UOk54M+trZmVm1tPM+oXXOV9aONfN5L08VtInga1hNtm8yGiIK0kDieaAe3+SMzN7Ll9BONcNxFEeezowMxTL9Ab6SfqlmX0+2wY7PMNL+hLwHNEsMf8S/s7J9oDOdVN5L481s+vNbISZjSaaqu13uSQ7ZPYd/irgFOAtM/s40SXKzlwO6lx3E3rjW5XH5qmXPm86LI+VtNjMTglT3Z5qZvWSVpjZCQWJsAUvj3UF1O1+mcrkO/zGMGf7Y8DTknYAb8UZlHMuHpn00s82s51mNgf4NtFwO5/OpHFJMyStlrRG0nXtbHeeJJNUnWHczrksZNJpd1oYAB8z+3/As2TwU0O4HfA24GxgPHCRpPFtbNeXqJ8g66F3nXOZyaTT7nZgT8rrPWFZ
R6YAa8xsrZk1AA8Cs9rY7kbgJmB/Bm0653KQScLLUnr2zOwgmX33Hw5sSHm9kRYzZkiaDIw0s9+2G4B0uaQaSTW1tbUZHNo515ZMEn6tpCsl9QiPq4C1uR44VOLdAlzT0bZmdoeZVZtZdVVVVa6Hdi6xMkn4fwA+AmwiOkufClyewX6bgJEpr0eEZc36AicCz0paB5wGzPeOO+cOJWmApEckvSZplaQPZ9tWh5fmZraV6C6fzloMjJU0hijRLwQuTml3FzCk+bWkZ4Gvm5n/yO5K0uyft549dt7f5jwuPcCtwBNmdr6knkCfbBvKpJf+nvA7fPPrgZLu6mi/MAbeFUS34q4CHjKzFZLmSmpZVeRcSQvJ3qo8NizPmqT+RKNM/QzAzBrMbGe27WXS+XZS6gHMbIekjCqAzOxx4PEWy76TZttpmbTpXBfVXnns4603z9gYoBb4uaQPEY0kfZWZ1WXTWCafPmWhWg4ASYPIsMrOuQSJa/bYitD27WHkqTog7U1smTTWkR8Az0t6mOje4vOB72Z7QOe6qbhmj90IbDSz5hvTHiGHhM/k1tp7gfOALcA7wN+Y2S+yPaBz3VQss8eGAWM3SBoXFp0JrMy2vYwnk5Q0lEMHwFif7UFz4dVyroA6VS2X0ks/iejMnpdeekkTica060l0D8zfmtmOrNrKoDx2JtFl/VHAVuBoormqvTzWdXfdrjw2k067G4luinndzMYQXVK8EGtUzrlYZJLwB8xsO1FvfZmZLQT8bjjnSlAmvfQ7JR1ONIHkfZK2Ev004JwrMZmc4WcCe4lq1p8A1gA5TXfjnCuOtGd4SX8ws6lEP8c19+w1d2L8q6R3gZvN7Ccxx+icy5P25oefGv72bWu9pMHAnwBPeOdKRNY39oeOvGn5C8U515Kkf5S0QtKrkh6Q1LvjvdLL6Z54M9ucy/7OdSfV//eWVuWxNd+8OusbbyQNB64ExpvZPkkPEZWZ351tmzmV7jnnIiHZW5XHhuW5qAAOk1RBVAf/di6NecI7lx95nz3WzDYB3wfWA5uBXWb2VNYR4gnvXL7EMXvsQKKRnscQ3dpeKSn2ueWccx2Lozx2OvCmmdWa2QHgUaLxJbPmCe9cfsRRHrseOE1SH0kiqmNZlUN7nvDO5UPojW81e2wuvfRh0ItHiK4eXiHK1ztyiTPjeviuwstjXQElsjzWOddNeMI7lyCe8M4liCe8cwniCe9cgnjCO5cgnvDOdWGS7pK0VdKrKcsGSXpa0hvh78D22kgV65RRkmYQzXxZDtxpZt9rsf5q4EtAI9H8WV80s7fijMkVjpmxfe9Odu2vo2+vPgypHECZuu85ZtRXb25VHrv+P76R67j0dwM/Bu5NWXYdsMDMvifpuvD62kwai+3GG0nlwOvAJ4imy1kMXGRmK1O2+TjwopntlfRlYJqZXdBeu37jTelYuXUt97/8BAeaGqkoK+f8E8/kpGHHU15WMkmf8Y03IdnncWjF3Hxgdq5JL2k08BszOzG8Xk2UK5slHQk8a2bj2mujWZzv/BRgjZmtNbMG4EGiyp/3mdlCM9sbXr4AjIgxHldA2+t28qvlT3GgqRGAxoNNPPzKM9TWZTVhSinIe3lsO45IGXzmHeCITHeMM+GHAxtSXm8My9K5DPjftlZIulxSjaSa2traPIbo4rK7YS/7GxsOWdZkB3mvvtuOcB7X7LHtsugSPePL9C5xbRVqfKuBm9tab2Z3mFm1mVVXVVUVNjiXlb69+tC7ouchy8pVRr9elUWKKHZxzR7bli3hUp7wd2umO8aZ8JuAkSmvR4Rlh5A0HbgBmGlm9THG4wpocJ8BXHDSWfQs7wFARVk5n5nwCaoqM+5QLjWxzB6bxnzgkvD8EuDXme4YZ6ddBVGn3ZlEib4YuNjMVqRsM4mo/G+Gmb2RSbveaVc6zIxte3exe/8eDi/NXvpOVcul9NK/P3tsHjrsHiAaHXoI0RwR/ww8BjwEjALeAj5rZu9m
1F6c5bGSzgF+SPSz3F1m9l1Jc4EaM5sv6RlgAtF4XQDrzaxlx8chPOFdAXW78livh3cuvW6X8CV1feWcy40nvHMJ4gnvXIJ4wjuXIJ7wziWIJ7xzXVia8tibJb0mabmkeZIGZNperOWxziXJcZ+e26o8ds1j34mjPPZp4Hoza5R0E3A9GZbH+hneuTwIyd5q9tiwPGtm9hzwbotlT5lZY3jZqSpTT3jn8qOQ5bGpvkiaKtO2eMI7lx8FL4+VdAPRaFH3ZbqPf4d3Lj8KWR6LpEuBTwJnWifuj/czvHP5UbDy2DBW5D8RlZTv7Wj7VJ7wzuVB6I1vNXtsrr30oTz2eWCcpI2SLiPqte8LPC1pmaT/zLg9r5ZzLi2vlnPOlS5PeOcSxBPeuQTxn+VcSWpobGTz7nfZe2A/Qyr7U1XZv9ghlQRPeFdy9jXU88TrS/jNqkUYUNmjF1dOncnYIUcVO7Quzy/pXclZv6uW/wnJDlB3oJ57X1rAnvp9RY2rFHjCu5KzY9+eVss2vfcuexr2FyGaeLVVHpuy7hpJJmlIpu35Jb0rOYP79G21bNSAKvr2OqwI0fzFhMnXtCqPfWXJD+Ioj0XSSOAsYH1nGvMzvCs5IwdU8dmTplIeJrUY0LuSS04+g8qevYsWU0j2VuWxYXnW2iqPDf6d6PbaTt0552d4V3J6V/Rk+tiJTBg2mr0H6hnSpy8D2zjrF1h75bGP5/NAkmYBm8zsZalzNwN6wruSVFFWzvD+g9Ou312/j407d9DQ1Mjw/gMZUhn7B0J75bF5S3hJfYBvEl3Od5onvOt2ttXt5qfP/46aDesAGFJ5ODdMn8XoQRn3bWWjUOWxxwJjgOaz+whgiaQpZvZORzv7d3jX7aza8vb7yQ6wrW4Pz7z+Ko0Hm+I8bEHKY83sFTMbamajzWw0sBGYnEmyQ8xn+FC3eyvRZJJ3mtn3WqzvRdT7eDKwHbjAzNbFGZPr/t56d9v7z6sq+3LWuJPYtOM9Fry2mmF9+1Nff5DDmsTmTe9SXl7GcccM4+iRuZ39X1nyg4MTJl8zmxazx+baS586e6ykjcA/m9nPsm0vtoSXVA7cBnyC6FNosaT5ZrYyZbPLgB1mdpykC4GbgAviisklw3FVw95/PmPcRH7y+98z44MncLBJrN28k6MqevHtf3uUP/95CwCTJo7mG1+fxYTxo3I6bkjux8njd3Yzu6iD9aM7016cl/RTgDVmttbMGoAHgVkttpkF3BOePwKcqc52OzrXwgeGHsm5H5zIsYOHsmTDRsrLyjhmUBW17+2jV3k5Cxe8+n6yAyxdto4XF60pYsSFE2fCDwc2pLzeGJa1uU0YdncX0KrrVdLlkmok1dTW1sYUrusuBhzWhy9Un87VH5vBgaYmBvXpQ2OTsbNuP0MP78Oyl9e12mfFik7dv1KySqLTzszuMLNqM6uuqqoqdjiuBPSsqOCo/gOZNeFDbK+ro0d5GQMrD2Pzrj2cMuXYVttP/NCYIkRZeHEm/CZgZMrrEWFZm9tIqgD6E3XeOZcXE0cM59szzmH73t0cMaAPViam/tUJTJw4+v1tpn3sBE45pfWHQHcU25h2IYFfB84kSuzFwMVmtiJlm68AE8zsH0Kn3d+Y2Wfba9fHtHPZ2H/gAA2NjWx5bw+79zbQu0nUvrOLiooyjj32CIZVDWhrt27XnxRbL32Y9+oK4Emin+XuMrMVkuYCNWY2H/gZ8AtJa4juF74wrnhcsvXu0YPePXrQ77CUApvjklc/76PWOpdetzvDl0SnnXMuPzzhnUuQkrukl1QLvBVT80OAbR1uVRhdKRboWvEUKpZtZhb37K8FVXIJHydJNWZWXew4oGvFAl0rnq4US6nxS3rnEsQT3rkE8YQ/1B3FDiBFV4oFulY8XSmWkuLf4Z1LED/DO5cgnvDOJUhiE17SOkmvSFomqSYsGyTpaUlvhL8DYzx+qxlF0h1fkR9JWiNpuaR0I6TmO545kjaF92iZ
pHNS1l0f4lkt6a/zHMtISQslrZS0QtJVYXnR3p/uIrEJH3zczCam/KZ7HbDAzMYCC8LruNxNNP5ZqnTHPxsYGx6XA7cXKB6Afw/v0UQzexxA0niiQqcTwj4/CUOa5UsjcI2ZjQdOA74SjlnM96dbSHrCt5Q65NY9wKfjOlCaGUXSHX8WcK9FXgAGSDqyAPGkMwt40MzqzexNYA3RkGb5imWzmS0Jz3cDq4hGRyra+9NdJDnhDXhK0kuSLg/LjjCzzeH5O8ARBY4p3fEzGS4sLleEy+S7Ur7iFCweSaOJRoF9ka75/pSUJCf8VDObTHQ5+BVJH01dadHvlUX7zbLYxw9uJ5r4YCKwGfhBIQ8u6XDgv4Gvmdl7qeu6yPtTchKb8Ga2KfzdSjQJ4BRgS/OlYPi7tcBhpTt+JsOF5Z2ZbTGzJjM7CPwXf7lsjz0eST2Ikv0+M3s0LO5S708pSmTCS6qU1Lf5OdE8Xa8SzRRySdjsEuDXBQ4t3fHnA18IvdGnAbtSLm1j0+J78Gyi96g5ngsl9ZI0hqizbFEejyui0ZBWmdktKau61PtTkswscQ/gGODl8FgB3BCWDybq/X0DeAYYFGMMDxBdJh8g+s55WbrjE428chvwZ+AVoLpA8fwiHG85UVIdmbL9DSGe1cDZeY5lKtHl+nJgWXicU8z3p7s8/NZa5xIkkZf0ziWVJ7xzCeIJ71yCeMI7lyCe8M4liCe8cwniCZ8Qkv5UoONcKunHhTiW6zxP+BiEiTS7FDP7SLFjyJc8l+Imiic8IOkGSa9L+oOkByR9XdKzkqrD+iGS1oXn5ZJulrQ4VJH9fVg+TdLvJc0HVkqaK+lrKcf4bvNADm0cf1o43iOSXpN0X7i9NF286yQNCc+rJT0bns8JVW3PSlor6cqUffaEv5L04zBwxTOSHpd0fgftVoZ2F0laKmlWhu/ruZKeD+/f3ZJul/RCiG1aaHOVpLtT9jkr7LNE0sOhgKY5tpskLQE+I+lKRQNkLJf0YCbxuBhnjy0Vkk4mGsxhItH7sQR4qZ1dLiO6V/sUSb2AP0p6KqybDJxoZm+Gss5HgR9KKgvHaK9mfBLRgBJvA38ETgf+kMU/6QPAx4G+wGpJt5vZgZT1s4FxwHii8tKVwF0dtHkD8Dsz+6KkAcAiSc+YWV26HSTNBq4GzjGzHeHzayDwYWAm0a26pwNfAhZLmkh0S++3gOlmVifp2tDG3NDsdosqHJH0NjDGzOpDTC4DiU944K+AeWa2FyCcodtzFnBS81kR6E9UPNIALLJoQAjMbJ2k7ZImESXWUjPb3k67i8xsY4hhGTCa7BL+t2ZWD9RL2hqOvTFl/UeBB8ysCXhb0u8yaPMsYKakr4fXvYFRRANTtOUMoBo4yw4ta/0fMzNJrwBbzOwVAEkriP69I4g+iP4YPiB6As+n7P+rlOfLgfskPQY8lsG/weEJ355G/vKVp3fKcgFfNbMnUzeWNA1oeca7E7gUGEbHZ9H6lOdNtP/fJl1snW0n03YFnGdmqzNs589EBUrHA6lzezfHdrBFnAdDnE3A02Z2UZp2U9/fc4k+vD4F3CBpgpk1ZhhfYvl3eHgO+LSkwxSVzH4qLF8HnByen5+y/ZPAlxXVayPpeEUltm2ZRzTm2ylhv3xJje28Tu77HHBB6Is4kujyv6N2nwS+2tyvEK5a2vNW2P9eSSd0IrYXgNMlHReOUynp+JYbha9II81sIXAt0VXW4Z04TmIlPuEtGjvtV0Slsv8LLA6rvk+U2EuJZittdifR994likZ4/SlpzqJm1gAsBB4Kl9D58i/ArYpG2+1su/OIyktXAvdy6CVzunZvBHoAy8Pl940dHcTMXgM+Bzws6dhMAjOzWqIrogckLQ+xfaCNTcuBX4avBkuBH5nZzkyOkXReHtuCpDnAHjP7fh7aKiPqBPyMmb2Ra3txCD3kvzGzR4odi4tf4s/wcVE0rPIaomGVu2Syu+TxM3wBSZpANIpMqnoz
OzXN9vOAMS0WX9uyw7AYFE0+cVOLxW+a2exixOMy4wnvXIL4Jb1zCeIJ71yCeMI7lyCe8M4lyP8Hquob3/fhN50AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(\n", + " data=jaccard_df,\n", + " x=\"query_n_unique_kmers\",\n", + " y=\"jaccard\",\n", + " height=3,\n", + " hue=\"ksize\",\n", + " palette=\"crest\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2aaf46a0", + "metadata": {}, + "source": [ + "# This value was dropped from .5 in the hp dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "19421cc7-be9f-45fc-b848-f29aa449a11e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:33.109164Z", + "iopub.status.busy": "2024-11-12T21:31:33.109056Z", + "iopub.status.idle": "2024-11-12T21:31:33.115226Z", + "shell.execute_reply": "2024-11-12T21:31:33.115001Z", + "shell.execute_reply.started": "2024-11-12T21:31:33.109153Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querymatchmoltypeksizejaccardquery_n_kmersquery_n_unique_kmersmatch_n_kmersmatch_n_unique_kmers
0BCL2ced9dayhoff20.8181822382827932
1BCL2ced9dayhoff30.606061237100278112
2BCL2ced9dayhoff40.198113236172277209
3BCL2ced9dayhoff50.055046235204276256
4BCL2ced9dayhoff60.016667234215275273
\n", + "
" + ], + "text/plain": [ + " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n", + "0 BCL2 ced9 dayhoff 2 0.818182 238 28 \n", + "1 BCL2 ced9 dayhoff 3 0.606061 237 100 \n", + "2 BCL2 ced9 dayhoff 4 0.198113 236 172 \n", + "3 BCL2 ced9 dayhoff 5 0.055046 235 204 \n", + "4 BCL2 ced9 dayhoff 6 0.016667 234 215 \n", + "\n", + " match_n_kmers match_n_unique_kmers \n", + "0 279 32 \n", + "1 278 112 \n", + "2 277 209 \n", + "3 276 256 \n", + "4 275 273 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_df.query(\"jaccard > 0.01\")" + ] + }, + { + "cell_type": "markdown", + "id": "02f77666-adb5-4aa2-b660-3807ca0848bb", + "metadata": {}, + "source": [ + "# Functionalize the analysis for any two sequences\n", + "\n", + "Thanks Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "cb300f9d-acbf-4663-91df-3689d270e907", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:33.115667Z", + "iopub.status.busy": "2024-11-12T21:31:33.115570Z", + "iopub.status.idle": "2024-11-12T21:31:33.206289Z", + "shell.execute_reply": "2024-11-12T21:31:33.206015Z", + "shell.execute_reply.started": "2024-11-12T21:31:33.115658Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querymatchmoltypeksizejaccardquery_n_kmersquery_n_unique_kmersquery_intersection_positionsmatch_n_kmersmatch_n_unique_kmersmatch_intersection_positions
0CED9BCL2dayhoff50.055046276256[[21, bbced], [27, febed], [28, ebedb], [32, b...235204[[10, cdcee], [24, cdbfc], [33, cebbb], [34, e...
1CED9BCL2dayhoff60.016667275273[[27, febedb], [46, cebbbb], [48, bbbbdc], [49...234215[[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81...
2CED9BCL2dayhoff70.002033274274[[48, bbbbdcb]]233219[[58, bbbbdcb]]
3CED9BCL2dayhoff80.000000273273[]232222[]
4CED9BCL2dayhoff90.000000272272[]231223[]
5CED9BCL2dayhoff100.000000271271[]230224[]
\n", + "
" + ], + "text/plain": [ + " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n", + "0 CED9 BCL2 dayhoff 5 0.055046 276 256 \n", + "1 CED9 BCL2 dayhoff 6 0.016667 275 273 \n", + "2 CED9 BCL2 dayhoff 7 0.002033 274 274 \n", + "3 CED9 BCL2 dayhoff 8 0.000000 273 273 \n", + "4 CED9 BCL2 dayhoff 9 0.000000 272 272 \n", + "5 CED9 BCL2 dayhoff 10 0.000000 271 271 \n", + "\n", + " query_intersection_positions match_n_kmers \\\n", + "0 [[21, bbced], [27, febed], [28, ebedb], [32, b... 235 \n", + "1 [[27, febedb], [46, cebbbb], [48, bbbbdc], [49... 234 \n", + "2 [[48, bbbbdcb]] 233 \n", + "3 [] 232 \n", + "4 [] 231 \n", + "5 [] 230 \n", + "\n", + " match_n_unique_kmers match_intersection_positions \n", + "0 204 [[10, cdcee], [24, cdbfc], [33, cebbb], [34, e... \n", + "1 215 [[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81... \n", + "2 219 [[58, bbbbdcb]] \n", + "3 222 [] \n", + "4 223 [] \n", + "5 224 [] " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from typing import Dict, List, Set\n", + "\n", + "import pandas as pd\n", + "\n", + "\n", + "class Sequence:\n", + "    \"\"\"A named protein sequence together with its re-encoded form for `moltype`.\"\"\"\n", + "\n", + "    def __init__(self, sequence: str, name: str, moltype: str = \"dayhoff\"):\n", + "        self.sequence = sequence\n", + "        self.name = name\n", + "        self.moltype = moltype\n", + "        self.processed_seq = self._process_sequence()\n", + "\n", + "    def _process_sequence(self) -> str:\n", + "        \"\"\"Return the result of `degenerate_protein_chatgpt(sequence, moltype)`.\"\"\"\n", + "        return degenerate_protein_chatgpt(self.sequence, self.moltype)\n", + "\n", + "    def get_kmers(self, k: int) -> List[str]:\n", + "        \"\"\"Return every length-`k` window of `processed_seq`, in order, duplicates kept.\"\"\"\n", + "        return [\n", + "            self.processed_seq[i : i + k]\n", + "            for i in range(len(self.processed_seq) - k + 1)\n", + "        ]\n", + "\n", + "\n", + "class KmerAnalyzer:\n", + "    \"\"\"Compare the k-mer content of two `Sequence` objects over a range of k.\"\"\"\n", + "\n", + "    def __init__(self, seq1: Sequence, seq2: Sequence):\n", + "        self.seq1 = seq1\n", + "        self.seq2 = seq2\n", + "\n", + "    def calculate_jaccard(self, kmer_set1: Set[str], kmer_set2: Set[str]) -> float:\n", + "        \"\"\"Return |intersection| / |union| of the two sets, or 0.0 if both are empty.\"\"\"\n", + "        intersection = len(kmer_set1.intersection(kmer_set2))\n",
+ "        union = len(kmer_set1.union(kmer_set2))\n", + "        return intersection / union if union > 0 else 0.0\n", + "\n", + "    def analyze_kmer_range(self, start_k: int, end_k: int) -> pd.DataFrame:\n", + "        \"\"\"Return one row per k in `start_k`..`end_k` (inclusive).\"\"\"\n", + "        results = [self._analyze_single_k(k) for k in range(start_k, end_k + 1)]\n", + "        return pd.DataFrame(results, columns=self._get_column_names())\n", + "\n", + "    @staticmethod\n", + "    def _first_positions(kmers: List[str]) -> Dict[str, int]:\n", + "        \"\"\"Map each k-mer to the index of its first occurrence in `kmers`.\"\"\"\n", + "        first_seen: Dict[str, int] = {}\n", + "        for position, kmer in enumerate(kmers):\n", + "            first_seen.setdefault(kmer, position)\n", + "        return first_seen\n", + "\n", + "    def get_intersecting_kmer_positions(self, kmers1, kmers2, kmer_set1, kmer_set2):\n", + "        \"\"\"Locate the k-mers shared by both sequences.\n", + "\n", + "        Returns two lists of `[position, kmer]` pairs (one per sequence), each sorted\n", + "        by position. Only the first occurrence of a repeated k-mer is reported.\n", + "        \"\"\"\n", + "        intersection = kmer_set1.intersection(kmer_set2)\n", + "\n", + "        # Build each first-occurrence index once instead of rescanning the k-mer\n", + "        # list with list.index() for every shared k-mer.\n", + "        first1 = self._first_positions(kmers1)\n", + "        first2 = self._first_positions(kmers2)\n", + "\n", + "        positions1 = sorted(([first1[kmer], kmer] for kmer in intersection), key=lambda x: x[0])\n", + "        positions2 = sorted(([first2[kmer], kmer] for kmer in intersection), key=lambda x: x[0])\n", + "        return positions1, positions2\n", + "\n", + "    def _analyze_single_k(self, k: int) -> List:\n", + "        \"\"\"Build one result row for k-mer size `k`, ordered as in `_get_column_names`.\"\"\"\n", + "        kmers1 = self.seq1.get_kmers(k)\n", + "        kmers2 = self.seq2.get_kmers(k)\n", + "        set1 = set(kmers1)\n", + "        set2 = set(kmers2)\n", + "\n", + "        pos1, pos2 = self.get_intersecting_kmer_positions(kmers1, kmers2, set1, set2)\n", + "\n", + "        return [\n", + "            self.seq1.name,\n", + "            self.seq2.name,\n", + "            self.seq1.moltype,\n", + "            k,\n", + "            self.calculate_jaccard(set1, set2),\n", + "            len(kmers1),\n", + "            len(set1),\n", + "            pos1,\n", + "            len(kmers2),\n", + "            len(set2),\n", + "            pos2,\n", + "        ]\n", + "\n", + "    def _get_column_names(self) -> List[str]:\n", + "        \"\"\"Column names matching the row layout produced by `_analyze_single_k`.\"\"\"\n", + "        return [\n", + "            \"query\",\n", + "            \"match\",\n", + "            \"moltype\",\n", + "            \"ksize\",\n", + "            \"jaccard\",\n", + "            \"query_n_kmers\",\n", + "            \"query_n_unique_kmers\",\n", + "            \"query_intersection_positions\",\n", + "            \"match_n_kmers\",\n", + "            \"match_n_unique_kmers\",\n", + "            \"match_intersection_positions\",\n", + "        ]\n", + "\n", + "\n", + "def compare_sequences(\n", + "    seq1_str: str,\n", + "    seq2_str: str,\n", + "    seq1_name: str,\n", + "    seq2_name: str,\n",
+ "    start_k: int = 5,\n", + "    end_k: int = 30,\n", + ") -> pd.DataFrame:\n", + "    \"\"\"Wrap two raw sequences as `Sequence` objects and tabulate their k-mer overlap for `start_k`..`end_k`.\"\"\"\n", + "    seq1 = Sequence(seq1_str, seq1_name)\n", + "    seq2 = Sequence(seq2_str, seq2_name)\n", + "    analyzer = KmerAnalyzer(seq1, seq2)\n", + "    return analyzer.analyze_kmer_range(start_k, end_k)\n", + "\n", + "\n", + "# Usage example:\n", + "df = compare_sequences(ced9_seq, bcl2_seq, \"CED9\", \"BCL2\", end_k=10)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "16bebea4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[21, 'bbced'],\n", + " [27, 'febed'],\n", + " [28, 'ebedb'],\n", + " [32, 'bbcbb'],\n", + " [46, 'cebbb'],\n", + " [47, 'ebbbb'],\n", + " [48, 'bbbbd'],\n", + " [49, 'bbbdc'],\n", + " [50, 'bbdcb'],\n", + " [94, 'cbecf'],\n", + " [100, 'bbbbe'],\n", + " [101, 'bbbeb'],\n", + " [111, 'cdcee'],\n", + " [137, 'eebeb'],\n", + " [167, 'fbdee'],\n", + " [168, 'bdeeb'],\n", + " [171, 'ebeeb'],\n", + " [177, 'bbfeb'],\n", + " [185, 'ecbec'],\n", + " [217, 'cdbfc'],\n", + " [252, 'beebb'],\n", + " [256, 'bbebb'],\n", + " [262, 'bebee'],\n", + " [264, 'beebe']]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmer5ced9bcl2intersection = df.iloc[0]['query_intersection_positions']\n", + "kmer5ced9bcl2intersection" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "251ce38c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[10, 'cdcee'],\n", + " [24, 'cdbfc'],\n", + " [33, 'cebbb'],\n", + " [34, 'ebbbb'],\n", + " [43, 'bbbbe'],\n", + " [49, 'bbcbb'],\n", + " [58, 'bbbbd'],\n", + " [59, 'bbbdc'],\n", + " [60, 'bbdcb'],\n", + " [82, 'bbbeb'],\n", + " [83, 'bbebb'],\n", + " [115, 'bbced'],\n", + " [139, 'cbecf'],\n", + " [143, 'fbdee'],\n", + " [144, 'bdeeb'],\n", + " [158, 'ecbec'],\n", + " [213, 'febed'],\n", + " [214, 'ebedb'],\n", + " [218, 'beebe'],\n", + " [219, 'eebeb'],\n", + " [221, 'bebee'],\n", + " [222, 'ebeeb'],\n", 
+ " [223, 'beebb'],\n", + " [232, 'bbfeb']]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmer5ced9bcl2match = df.iloc[0]['match_intersection_positions']\n", + "kmer5ced9bcl2match" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "043aff40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[27, 'febedb'],\n", + " [46, 'cebbbb'],\n", + " [48, 'bbbbdc'],\n", + " [49, 'bbbdcb'],\n", + " [100, 'bbbbeb'],\n", + " [167, 'fbdeeb'],\n", + " [255, 'bbbebb'],\n", + " [262, 'bebeeb']]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmer6ced9bcl2intersection = df.iloc[1]['query_intersection_positions']\n", + "kmer6ced9bcl2intersection" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "85996100", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[33, 'cebbbb'],\n", + " [58, 'bbbbdc'],\n", + " [59, 'bbbdcb'],\n", + " [81, 'bbbbeb'],\n", + " [82, 'bbbebb'],\n", + " [143, 'fbdeeb'],\n", + " [213, 'febedb'],\n", + " [221, 'bebeeb']]" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmer6ced9bcl2match = df.iloc[1]['match_intersection_positions']\n", + "kmer6ced9bcl2match" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "29e43f64-b3f0-4bdd-9286-004d0749ffe9", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:33.206756Z", + "iopub.status.busy": "2024-11-12T21:31:33.206652Z", + "iopub.status.idle": "2024-11-12T21:31:33.237938Z", + "shell.execute_reply": "2024-11-12T21:31:33.237703Z", + "shell.execute_reply.started": "2024-11-12T21:31:33.206746Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querymatchmoltypeksizejaccardquery_n_kmersquery_n_unique_kmersquery_intersection_positionsmatch_n_kmersmatch_n_unique_kmersmatch_intersection_positions
0CED9BCL2dayhoff50.055046276256[[21, bbced], [27, febed], [28, ebedb], [32, b...235204[[10, cdcee], [24, cdbfc], [33, cebbb], [34, e...
1CED9BCL2dayhoff60.016667275273[[27, febedb], [46, cebbbb], [48, bbbbdc], [49...234215[[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81...
\n", + "
" + ], + "text/plain": [ + " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n", + "0 CED9 BCL2 dayhoff 5 0.055046 276 256 \n", + "1 CED9 BCL2 dayhoff 6 0.016667 275 273 \n", + "\n", + " query_intersection_positions match_n_kmers \\\n", + "0 [[21, bbced], [27, febed], [28, ebedb], [32, b... 235 \n", + "1 [[27, febedb], [46, cebbbb], [48, bbbbdc], [49... 234 \n", + "\n", + " match_n_unique_kmers match_intersection_positions \n", + "0 204 [[10, cdcee], [24, cdbfc], [33, cebbb], [34, e... \n", + "1 215 [[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81... " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.query(\"ksize >= 5 and ksize <= 6\")" + ] + }, + { + "cell_type": "markdown", + "id": "bba34ea5-8164-42b3-ba03-054e225b2963", + "metadata": {}, + "source": [ + "### Make Sourmash signatures for CED9, BCL2" + ] + }, + { + "cell_type": "markdown", + "id": "f91b6c5b-254d-4a33-9afe-e1d442a13f65", + "metadata": {}, + "source": [ + "#### CED9" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "cdacb2ab-faa7-45f4-b43b-6e43ff61375f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:54:04.752153Z", + "iopub.status.busy": "2024-11-12T21:54:04.751809Z", + "iopub.status.idle": "2024-11-12T21:54:04.754887Z", + "shell.execute_reply": "2024-11-12T21:54:04.754601Z", + "shell.execute_reply.started": "2024-11-12T21:54:04.752139Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'MTRCTADNSLTNPAYRRRTMATGEMKEFLGIKGTEPTDFGINSDAQDLPSPSRQASTRRMSIGESIDGKINDWEEPRLDIEGFVVDYFTHRIRQNGMEWFGAPGLPCGVQPEHEMMRVMGTIFEKKHAENFETFCEQLLAVPRISFSLYQDVVRTVGNAQTDQCPMSYGRLIGLISFGGFVAAKMMESVELQGQVRNLFVYTSLFIKTRIRNNWKEHNRSWDDFMTLGKQMKEDYERAEAEKVGRRKQNRRWSMIGAGVTAGAIGIVGVVVCGRMMFSLK'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ced9_seq" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": 
"6f770cb4-4c3a-4968-b3ae-27ccede71ecb", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:54:53.753581Z", + "iopub.status.busy": "2024-11-12T21:54:53.753250Z", + "iopub.status.idle": "2024-11-12T21:54:53.756533Z", + "shell.execute_reply": "2024-11-12T21:54:53.756224Z", + "shell.execute_reply.started": "2024-11-12T21:54:53.753567Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting test-data/ced9.fasta\n" + ] + } + ], + "source": [ + "%%file test-data/ced9.fasta\n", + ">ced9\n", + "MTRCTADNSLTNPAYRRRTMATGEMKEFLGIKGTEPTDFGINSDAQDLPSPSRQASTRRMSIGESIDGKINDWEEPRLDIEGFVVDYFTHRIRQNGMEWFGAPGLPCGVQPEHEMMRVMGTIFEKKHAENFETFCEQLLAVPRISFSLYQDVVRTVGNAQTDQCPMSYGRLIGLISFGGFVAAKMMESVELQGQVRNLFVYTSLFIKTRIRNNWKEHNRSWDDFMTLGKQMKEDYERAEAEKVGRRKQNRRWSMIGAGVTAGAIGIVGVVVCGRMMFSLK" + ] + }, + { + "cell_type": "markdown", + "id": "b3a9e2d8-84e2-44a0-a9ec-2f15a41d9340", + "metadata": {}, + "source": [ + "#### BCL2" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "0b2b11c3-549c-4b50-86cf-46b68830f52e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:54:56.856675Z", + "iopub.status.busy": "2024-11-12T21:54:56.856350Z", + "iopub.status.idle": "2024-11-12T21:54:56.859329Z", + "shell.execute_reply": "2024-11-12T21:54:56.859070Z", + "shell.execute_reply.started": "2024-11-12T21:54:56.856662Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPPVVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bcl2_seq" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0a0eb7e6-7cf0-473c-a3b9-651bdfa074eb", + "metadata": { + "execution": { + "iopub.execute_input": 
"2024-11-12T21:55:06.101451Z", + "iopub.status.busy": "2024-11-12T21:55:06.101120Z", + "iopub.status.idle": "2024-11-12T21:55:06.104798Z", + "shell.execute_reply": "2024-11-12T21:55:06.104500Z", + "shell.execute_reply.started": "2024-11-12T21:55:06.101437Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting test-data/bcl2.fasta\n" + ] + } + ], + "source": [ + "%%file test-data/bcl2.fasta\n", + ">bcl2\n", + "MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPPVVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK" + ] + }, + { + "cell_type": "markdown", + "id": "b43fab68-3588-492d-9e7b-f8fab806725a", + "metadata": {}, + "source": [ + "### Compute signatures" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "52d8bf59-8e38-4578-825d-82e780b6a513", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:55:37.747266Z", + "iopub.status.busy": "2024-11-12T21:55:37.746938Z", + "iopub.status.idle": "2024-11-12T21:55:39.944359Z", + "shell.execute_reply": "2024-11-12T21:55:39.943940Z", + "shell.execute_reply.started": "2024-11-12T21:55:37.747252Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/ced9.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... reading sequences from test-data/ced9.fasta\n", + "\n", + "\u001b[K... 
test-data/ced9.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/ced9.dayhoff.k5.sig'\n", + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/bcl2.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... reading sequences from test-data/bcl2.fasta\n", + "\n", + "\u001b[K... test-data/bcl2.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/bcl2.dayhoff.k5.sig'\n", + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/ced9.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... reading sequences from test-data/ced9.fasta\n", + "\n", + "\u001b[K... test-data/ced9.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/ced9.dayhoff.k6.sig'\n", + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/bcl2.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... 
reading sequences from test-data/bcl2.fasta\n", + "\n", + "\u001b[K... test-data/bcl2.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/bcl2.dayhoff.k6.sig'\n" + ] + } + ], + "source": [ + "ksizes = 5, 6\n", + "ced9_bcl2_fastas = {\"ced9\": \"test-data/ced9.fasta\", \"bcl2\": \"test-data/bcl2.fasta\"}\n", + "ced9_bcl2_sigfiles = {}\n", + "\n", + "\n", + "for ksize in ksizes:\n", + "    for name, fasta in ced9_bcl2_fastas.items():\n", + "        param_string = f\"dayhoff,scaled=1,k={ksize}\"\n", + "        sig = f\"test-data/{name}.dayhoff.k{ksize}.sig\"\n", + "        ! sourmash sketch protein -p $param_string --name $name $fasta -o $sig\n", + "        ced9_bcl2_sigfiles[(name, ksize)] = sig" + ] + }, + { + "cell_type": "markdown", + "id": "39fa469c-4799-4c64-b683-816ae26948ca", + "metadata": {}, + "source": [ + "### Load Signatures" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "953365a4-f46e-45ae-9fb9-944c1af5e43b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:55:57.556563Z", + "iopub.status.busy": "2024-11-12T21:55:57.556065Z", + "iopub.status.idle": "2024-11-12T21:55:57.563257Z", + "shell.execute_reply": "2024-11-12T21:55:57.563020Z", + "shell.execute_reply.started": "2024-11-12T21:55:57.556546Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{5: {'ced9': SourmashSignature('ced9', 1dd6b6f2),\n", + " 'bcl2': SourmashSignature('bcl2', 49f32c24)},\n", + " 6: {'ced9': SourmashSignature('ced9', 1dd6b6f2),\n", + " 'bcl2': SourmashSignature('bcl2', 49f32c24)}}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ksizes = 5, 6\n", + "# One independent dict per ksize: dict.fromkeys(ksizes, {}) would make every\n", + "# key share a single dict, so the k=6 signatures would overwrite the k=5 ones.\n", + "ced9_bcl2_sigs = {ksize: {} for ksize in ksizes}\n", + "\n", + "for (name, ksize), sigfile in ced9_bcl2_sigfiles.items():\n", + "    ced9_bcl2_sigs[ksize][name] = list(sourmash.load_file_as_signatures(sigfile))[0]\n", + "ced9_bcl2_sigs" + ] + }, + { +
"cell_type": "markdown", + "id": "2795adfb-6bbd-47ef-8bd4-a22a22129813", + "metadata": {}, + "source": [ + "### Show SigSeq Alignment" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "12131870-92f9-42cd-9b76-842dd2b429aa", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:56:30.609180Z", + "iopub.status.busy": "2024-11-12T21:56:30.608857Z", + "iopub.status.idle": "2024-11-12T21:56:30.618914Z", + "shell.execute_reply": "2024-11-12T21:56:30.618645Z", + "shell.execute_reply.started": "2024-11-12T21:56:30.609167Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ksize: 8\n" + ] + }, + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[20], line 10\u001b[0m\n\u001b[0;32m 7\u001b[0m bcl2_sigseq \u001b[38;5;241m=\u001b[39m SigSeq(sigs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbcl2\u001b[39m\u001b[38;5;124m\"\u001b[39m], bcl2_seq)\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 10\u001b[0m \u001b[43mced9_sigseq\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdisplay_alignment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbcl2_sigseq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28mprint\u001b[39m(e)\n", + "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:244\u001b[0m, in \u001b[0;36mSigSeq.display_alignment\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[38;5;124;03m\"\"\"Displays the alignment between two sequences\"\"\"\u001b[39;00m\n\u001b[0;32m 243\u001b[0m \u001b[38;5;66;03m# Compute 
# %% Show the k-mer alignment between CED9 and BCL2 for every loaded ksize
from sigseq import SigSeq

for ksize, sigs in ced9_bcl2_sigs.items():
    print(f"ksize: {ksize}")

    ced9_sigseq = SigSeq(sigs["ced9"], ced9_seq)
    bcl2_sigseq = SigSeq(sigs["bcl2"], bcl2_seq)

    try:
        ced9_sigseq.display_alignment(bcl2_sigseq)
    except ValueError as e:
        # Best effort: report the problem and move on to the next ksize.
        print(e)
    except IndexError as e:
        # SigSeq.compute_overlap evaluates overlap[-1][0] and raises
        # "list index out of range" when `overlap` is empty.
        # NOTE(review): presumably this means the two sketches share no k-mers
        # at this ksize -- confirm in sigseq.py and consider raising a clearer
        # error there.
        print(f"ksize {ksize}: could not compute overlap ({type(e).__name__}: {e})")
The following proteins were used for analysis: CD47_HUMAN (Q08722), and H7C7N8(P66)_BORBU (H7C7N8).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "bdb27e70-40c1-4aa8-981d-3aa234a5496b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:34.148120Z", + "iopub.status.busy": "2024-11-12T21:31:34.147864Z", + "iopub.status.idle": "2024-11-12T21:31:34.150240Z", + "shell.execute_reply": "2024-11-12T21:31:34.149995Z", + "shell.execute_reply.started": "2024-11-12T21:31:34.148107Z" + } + }, + "outputs": [], + "source": [ + "CD47_HUMAN = \"MWPLVAALLLGSACCGSAQLLFNKTKSVEFTFCNDTVVIPCFVTNMEAQNTTEVYVKWKFKGRDIYTFDGALNKSTVPTDFSSAKIEVSQLLKGDASLKMDKSDAVSHTGNYTCEVTELTREGETIIELKYRVVSWFSPNENILIVIFPIFAILLFWGQFGIKTLKYRSGGMDEKTIALLVAGLVITVIVIVGAILFVPGEYSLKNATGLGLIVTSTGILILLHYYVFSTAIGLTSFVIAILVIQVIAYILAVVGLSLCIAACIPMHGPLLISGLSILALAQLLGLVYMKFVASNQKTIQPPRKAVEEPLNAFKESKGMMNDE\"" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "fe826dbe-334f-4038-8ef1-9682455e28f7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:34.345613Z", + "iopub.status.busy": "2024-11-12T21:31:34.345364Z", + "iopub.status.idle": "2024-11-12T21:31:34.347806Z", + "shell.execute_reply": "2024-11-12T21:31:34.347546Z", + "shell.execute_reply.started": "2024-11-12T21:31:34.345600Z" + } + }, + "outputs": [], + "source": [ + "H7C7N8_BORBU = 
\"MKSHILYKLIIFLTTSAAIFAADALKEKDIFKINPWMPTFGFENTSEFRLDMDELVPGFENKSKITIKLKPFEANPELGKDDPFSAYIKVEDLALKAEGKKGDQFKIDVGDITAQINMYDFFIKISTMTDFDFNKESLFSFAPMTGFKSTYYGFPSNDRAVRGTILARGTSKNIGTIQLGYKLPKLDLTFAIGGTGTGNRNQENDKDTPYNKTYQGILYGIQATWKPIKNLLDQNEDTKSVIAETPFELNFGLSGAYGNETFNNSSITYSLKDKSVVGNDLLSPTLSNSAILASFGAKYKLGLTKINDKNTYLILQMGTDFGIDPFASDFSIFGHISKAANFKKETPSDPNKKAEIFDPNGNALNFSKNTELGIAFSTGASIGFAWNKDTGEKESWAIKGSDSYSTRLFGEQDKKSGVALGISYGQNLYRSKDTEKRLKTISENAFQSLNVEISSYEDNKKGIINGLGWITSIGLYDILRQKSVENYPTTISSTTENNQTEQSSTSTKTTTPNLTFEDAMKLGLALYLDYAIPIASISTEAYVVPYIGAYILGPSNKLSSDATKIYLKTGLSLEKLIRFTTISLGWDSNNIIELANKNTNNAAIGSAFLQFKIAYSGS\"" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ad498f99-a8c2-403f-b827-c6065c02725a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:34.529424Z", + "iopub.status.busy": "2024-11-12T21:31:34.529059Z", + "iopub.status.idle": "2024-11-12T21:31:34.653853Z", + "shell.execute_reply": "2024-11-12T21:31:34.653567Z", + "shell.execute_reply.started": "2024-11-12T21:31:34.529412Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querymatchmoltypeksizejaccardquery_n_kmersquery_n_unique_kmersquery_intersection_positionsmatch_n_kmersmatch_n_unique_kmersmatch_intersection_positions
0CD47p66dayhoff50.098266319269[[2, beebb], [8, eebbb], [15, bbbce], [26, bec...614491[[9, eefeb], [10, efebb], [12, ebbbb], [14, bb...
1CD47p66dayhoff60.026838318294[[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83...613586[[9, eefebb], [75, bcebdc], [105, decebc], [10...
2CD47p66dayhoff70.006579317306[[84, decebce], [105, ebdbbcf], [197, ebbcfbe]...612612[[105, decebce], [154, bbccdbe], [159, bedbbee...
3CD47p66dayhoff80.000000316314[]611611[]
4CD47p66dayhoff90.000000315315[]610610[]
5CD47p66dayhoff100.000000314314[]609609[]
\n", + "
" + ], + "text/plain": [ + " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n", + "0 CD47 p66 dayhoff 5 0.098266 319 269 \n", + "1 CD47 p66 dayhoff 6 0.026838 318 294 \n", + "2 CD47 p66 dayhoff 7 0.006579 317 306 \n", + "3 CD47 p66 dayhoff 8 0.000000 316 314 \n", + "4 CD47 p66 dayhoff 9 0.000000 315 315 \n", + "5 CD47 p66 dayhoff 10 0.000000 314 314 \n", + "\n", + " query_intersection_positions match_n_kmers \\\n", + "0 [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... 614 \n", + "1 [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... 613 \n", + "2 [[84, decebce], [105, ebdbbcf], [197, ebbcfbe]... 612 \n", + "3 [] 611 \n", + "4 [] 610 \n", + "5 [] 609 \n", + "\n", + " match_n_unique_kmers match_intersection_positions \n", + "0 491 [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... \n", + "1 586 [[9, eefebb], [75, bcebdc], [105, decebc], [10... \n", + "2 612 [[105, decebce], [154, bbccdbe], [159, bedbbee... \n", + "3 611 [] \n", + "4 610 [] \n", + "5 609 [] " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_cd47_df = compare_sequences(CD47_HUMAN, H7C7N8_BORBU, \"CD47\", \"p66\",end_k=10)\n", + "p66_cd47_df" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "5e162f4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[21, 'bbced'],\n", + " [27, 'febed'],\n", + " [28, 'ebedb'],\n", + " [32, 'bbcbb'],\n", + " [46, 'cebbb'],\n", + " [47, 'ebbbb'],\n", + " [48, 'bbbbd'],\n", + " [49, 'bbbdc'],\n", + " [50, 'bbdcb'],\n", + " [94, 'cbecf'],\n", + " [100, 'bbbbe'],\n", + " [101, 'bbbeb'],\n", + " [111, 'cdcee'],\n", + " [137, 'eebeb'],\n", + " [167, 'fbdee'],\n", + " [168, 'bdeeb'],\n", + " [171, 'ebeeb'],\n", + " [177, 'bbfeb'],\n", + " [185, 'ecbec'],\n", + " [217, 'cdbfc'],\n", + " [252, 'beebb'],\n", + " [256, 'bbebb'],\n", + " [262, 'bebee'],\n", + " [264, 'beebe']]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "kmer5p66cd47intersection = df.iloc[0]['query_intersection_positions']\n", + "kmer5p66cd47intersection" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "451bc42d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[10, 'cdcee'],\n", + " [24, 'cdbfc'],\n", + " [33, 'cebbb'],\n", + " [34, 'ebbbb'],\n", + " [43, 'bbbbe'],\n", + " [49, 'bbcbb'],\n", + " [58, 'bbbbd'],\n", + " [59, 'bbbdc'],\n", + " [60, 'bbdcb'],\n", + " [82, 'bbbeb'],\n", + " [83, 'bbebb'],\n", + " [115, 'bbced'],\n", + " [139, 'cbecf'],\n", + " [143, 'fbdee'],\n", + " [144, 'bdeeb'],\n", + " [158, 'ecbec'],\n", + " [213, 'febed'],\n", + " [214, 'ebedb'],\n", + " [218, 'beebe'],\n", + " [219, 'eebeb'],\n", + " [221, 'bebee'],\n", + " [222, 'ebeeb'],\n", + " [223, 'beebb'],\n", + " [232, 'bbfeb']]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmer5p66cd47match = df.iloc[0]['match_intersection_positions']\n", + "kmer5p66cd47match" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "dc528452", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[27, 'febedb'],\n", + " [46, 'cebbbb'],\n", + " [48, 'bbbbdc'],\n", + " [49, 'bbbdcb'],\n", + " [100, 'bbbbeb'],\n", + " [167, 'fbdeeb'],\n", + " [255, 'bbbebb'],\n", + " [262, 'bebeeb']]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmer6p66cd47intersection = df.iloc[1]['query_intersection_positions']\n", + "kmer6p66cd47intersection" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "55df9052", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[33, 'cebbbb'],\n", + " [58, 'bbbbdc'],\n", + " [59, 'bbbdcb'],\n", + " [81, 'bbbbeb'],\n", + " [82, 'bbbebb'],\n", + " [143, 'fbdeeb'],\n", + " [213, 'febedb'],\n", + " [221, 'bebeeb']]" + ] + }, + "execution_count": 48, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmer6p66cd47match = df.iloc[1]['match_intersection_positions']\n", + "kmer6p66cd47match" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42f65950", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b827bc3f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ade0f902", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f17735", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "4f26d57c-29bc-4c04-8dd8-48fab6bf5fb3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-11T18:37:59.846385Z", + "iopub.status.busy": "2024-11-11T18:37:59.846101Z", + "iopub.status.idle": "2024-11-11T18:37:59.849548Z", + "shell.execute_reply": "2024-11-11T18:37:59.849227Z", + "shell.execute_reply.started": "2024-11-11T18:37:59.846369Z" + } + }, + "source": [ + "\n", + "### Q: Where does P66 bind CD47? A: 181-187aa\n", + "\n", + "> Utilizing a p66-deficient B. burgdorferi strain of B31-A3 (Δp66) we determined that P66 is required for CV1-G4 surface binding (Figure 2A). We next sought to determine residues on P66 critical for SIRPɑ interaction. We have previously demonstrated that two aspartic acid residues, D184 and D186, on a predicted extracellular loop of P66 (181–187) are required for integrin binding19. B. burgdorferi expressing the mutant D184A and D186A, p66D184A,D186A, or loss of the loop, p66Δ181−187, demonstrated loss of CV1-G4 binding (Figure 2A). Consistent to previous structure predictions, these sites map to an unstructured extracellular loop on a structure of P66 generated by Alphafold2 (Figure 2B and Extended Data 2A). We postulate this region is also required for SIRPɑ binding. 
# %% Flag the rows (one per ksize) whose shared k-mers start inside the P66 loop
# Residues 181-187 of P66, the loop quoted above as required for binding.
p66_binding_positions = set(range(181, 188))

# NOTE(review): only the *start* index of each shared k-mer is tested, so a
# k-mer that begins just upstream and extends into the loop is not counted.
# NOTE(review): the positions appear to be 0-based offsets while the quoted
# residue range is presumably 1-based -- confirm before interpreting the flags.
overlapping_p66_cd47_mimickry = []
for shared_kmers in p66_cd47_df.match_intersection_positions:
    kmer_starts = {start for start, _kmer in shared_kmers}
    overlapping_p66_cd47_mimickry.append(
        not kmer_starts.isdisjoint(p66_binding_positions)
    )
overlapping_p66_cd47_mimickry
"2024-11-12T21:31:35.503628Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querymatchmoltypeksizejaccardquery_n_kmersquery_n_unique_kmersquery_intersection_positionsmatch_n_kmersmatch_n_unique_kmersmatch_intersection_positions
0CD47p66dayhoff50.098266319269[[2, beebb], [8, eebbb], [15, bbbce], [26, bec...614491[[9, eefeb], [10, efebb], [12, ebbbb], [14, bb...
1CD47p66dayhoff60.026838318294[[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83...613586[[9, eefebb], [75, bcebdc], [105, decebc], [10...
\n", + "
" + ], + "text/plain": [ + " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n", + "0 CD47 p66 dayhoff 5 0.098266 319 269 \n", + "1 CD47 p66 dayhoff 6 0.026838 318 294 \n", + "\n", + " query_intersection_positions match_n_kmers \\\n", + "0 [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... 614 \n", + "1 [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... 613 \n", + "\n", + " match_n_unique_kmers match_intersection_positions \n", + "0 491 [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... \n", + "1 586 [[9, eefebb], [75, bcebdc], [105, decebc], [10... " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_cd47_df.loc[overlapping_p66_cd47_mimickry]" + ] + }, + { + "cell_type": "markdown", + "id": "63e5771c-e1f5-452a-8417-1ba9cc8164d6", + "metadata": {}, + "source": [ + "### Make Sourmash signatures for P66, CD47" + ] + }, + { + "cell_type": "markdown", + "id": "762ae606-f888-4a85-97a6-0b45e61634e1", + "metadata": {}, + "source": [ + "#### CD47" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "dd8f56ea-bf46-42b5-a4ce-58916b1212f3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:36.388801Z", + "iopub.status.busy": "2024-11-12T21:31:36.388484Z", + "iopub.status.idle": "2024-11-12T21:31:36.391508Z", + "shell.execute_reply": "2024-11-12T21:31:36.391259Z", + "shell.execute_reply.started": "2024-11-12T21:31:36.388788Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting test-data/cd47.fasta\n" + ] + } + ], + "source": [ + "%%file test-data/cd47.fasta\n", + ">CD47_HUMAN\n", + "MWPLVAALLLGSACCGSAQLLFNKTKSVEFTFCNDTVVIPCFVTNMEAQNTTEVYVKWKFKGRDIYTFDGALNKSTVPTDFSSAKIEVSQLLKGDASLKMDKSDAVSHTGNYTCEVTELTREGETIIELKYRVVSWFSPNENILIVIFPIFAILLFWGQFGIKTLKYRSGGMDEKTIALLVAGLVITVIVIVGAILFVPGEYSLKNATGLGLIVTSTGILILLHYYVFSTAIGLTSFVIAILVIQVIAYILAVVGLSLCIAACIPMHGPLLISGLSILALAQLLGLVYMKFVASNQKTIQPPRKAVEEPLNAFKESKGMMNDE" 
+ ] + }, + { + "cell_type": "markdown", + "id": "655714be-f3f0-40ac-8641-96eae10c6e2e", + "metadata": {}, + "source": [ + "#### P66" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "1aaff219-c98e-4c5d-b057-534703f64485", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:31:37.163245Z", + "iopub.status.busy": "2024-11-12T21:31:37.162832Z", + "iopub.status.idle": "2024-11-12T21:31:37.165660Z", + "shell.execute_reply": "2024-11-12T21:31:37.165403Z", + "shell.execute_reply.started": "2024-11-12T21:31:37.163232Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting test-data/p66.fasta\n" + ] + } + ], + "source": [ + "%%file test-data/p66.fasta\n", + ">P66_H7C7N8_BORBU\n", + "MKSHILYKLIIFLTTSAAIFAADALKEKDIFKINPWMPTFGFENTSEFRLDMDELVPGFENKSKITIKLKPFEANPELGKDDPFSAYIKVEDLALKAEGKKGDQFKIDVGDITAQINMYDFFIKISTMTDFDFNKESLFSFAPMTGFKSTYYGFPSNDRAVRGTILARGTSKNIGTIQLGYKLPKLDLTFAIGGTGTGNRNQENDKDTPYNKTYQGILYGIQATWKPIKNLLDQNEDTKSVIAETPFELNFGLSGAYGNETFNNSSITYSLKDKSVVGNDLLSPTLSNSAILASFGAKYKLGLTKINDKNTYLILQMGTDFGIDPFASDFSIFGHISKAANFKKETPSDPNKKAEIFDPNGNALNFSKNTELGIAFSTGASIGFAWNKDTGEKESWAIKGSDSYSTRLFGEQDKKSGVALGISYGQNLYRSKDTEKRLKTISENAFQSLNVEISSYEDNKKGIINGLGWITSIGLYDILRQKSVENYPTTISSTTENNQTEQSSTSTKTTTPNLTFEDAMKLGLALYLDYAIPIASISTEAYVVPYIGAYILGPSNKLSSDATKIYLKTGLSLEKLIRFTTISLGWDSNNIIELANKNTNNAAIGSAFLQFKIAYSGS" + ] + }, + { + "cell_type": "markdown", + "id": "c2986b10-c40f-407d-ba7e-c221db770da2", + "metadata": {}, + "source": [ + "### Compute signatures" + ] + }, + { + "cell_type": "markdown", + "id": "b150efd8", + "metadata": {}, + "source": [ + "# This value was dropped from ksizes = 8, 9 in the hp dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "ca31427f-04bd-4304-b3a4-b9f28e976760", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:35:43.660315Z", + "iopub.status.busy": "2024-11-12T21:35:43.659921Z", + "iopub.status.idle": 
"2024-11-12T21:35:45.842797Z", + "shell.execute_reply": "2024-11-12T21:35:45.842366Z", + "shell.execute_reply.started": "2024-11-12T21:35:43.660302Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/p66.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... reading sequences from test-data/p66.fasta\n", + "\n", + "\u001b[K... test-data/p66.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/p66.dayhoff.k3.sig'\n", + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/cd47.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... reading sequences from test-data/cd47.fasta\n", + "\n", + "\u001b[K... test-data/cd47.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/cd47.dayhoff.k3.sig'\n", + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. 
Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/p66.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... reading sequences from test-data/p66.fasta\n", + "\n", + "\u001b[K... test-data/p66.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/p66.dayhoff.k4.sig'\n", + "\n", + "\u001b[K\n", + "== This is sourmash version 4.8.12. ==\n", + "\n", + "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n", + "\n", + "\n", + "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n", + "\n", + "\u001b[Kcomputing signatures for files: test-data/cd47.fasta\n", + "\n", + "\u001b[KComputing a total of 1 signature(s) for each input.\n", + "\n", + "\u001b[K... reading sequences from test-data/cd47.fasta\n", + "\n", + "\u001b[K... test-data/cd47.fasta 1 sequences\n", + "\n", + "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n", + "\n", + "\u001b[Ksaved 1 signature(s) to 'test-data/cd47.dayhoff.k4.sig'\n" + ] + } + ], + "source": [ + "ksizes = 3, 4\n", + "p66_cd47_fastas = {\"p66\": \"test-data/p66.fasta\", \"cd47\": \"test-data/cd47.fasta\"}\n", + "p66_cd47_sigfiles = {}\n", + "\n", + "\n", + "for ksize in ksizes:\n", + " for name, fasta in p66_cd47_fastas.items():\n", + " param_string = f\"dayhoff,scaled=1,k={ksize}\"\n", + " sig = f\"test-data/{name}.dayhoff.k{ksize}.sig\"\n", + " ! 
sourmash sketch protein -p $param_string --name $name $fasta -o $sig\n", + " p66_cd47_sigfiles[(name, ksize)] = sig" + ] + }, + { + "cell_type": "markdown", + "id": "6274314c-56e2-4041-86ea-8d252f50e35e", + "metadata": {}, + "source": [ + "### Load Signatures" + ] + }, + { + "cell_type": "markdown", + "id": "510e0d98", + "metadata": {}, + "source": [ + "# This value was dropped from ksizes = 8, 9 in the hp dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "90f21140-c447-4880-8aa8-7ef2cbc8f917", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:35:47.715555Z", + "iopub.status.busy": "2024-11-12T21:35:47.715175Z", + "iopub.status.idle": "2024-11-12T21:35:47.721686Z", + "shell.execute_reply": "2024-11-12T21:35:47.721466Z", + "shell.execute_reply.started": "2024-11-12T21:35:47.715538Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{3: {'p66': SourmashSignature('p66', 06f1c759),\n", + " 'cd47': SourmashSignature('cd47', bafcd8ea)},\n", + " 4: {'p66': SourmashSignature('p66', 06f1c759),\n", + " 'cd47': SourmashSignature('cd47', bafcd8ea)}}" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ksizes = 3, 4\n", + "p66_cd47_sigs = dict.fromkeys(ksizes, {})\n", + "\n", + "for (name, ksize), sigfile in p66_cd47_sigfiles.items():\n", + " p66_cd47_sigs[ksize][name] = list(sourmash.load_file_as_signatures(sigfile))[0]\n", + "p66_cd47_sigs" + ] + }, + { + "cell_type": "markdown", + "id": "a46a7223-9c07-4d04-96cd-946e263c2cfb", + "metadata": {}, + "source": [ + "### SHow SigSeq Alignment" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "id": "15a5a246-9fe3-4db8-afd1-1bd140c62918", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:56:40.338811Z", + "iopub.status.busy": "2024-11-12T21:56:40.338426Z", + "iopub.status.idle": "2024-11-12T21:56:40.355520Z", + "shell.execute_reply": "2024-11-12T21:56:40.355182Z", + 
"shell.execute_reply.started": "2024-11-12T21:56:40.338798Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ksize: 3\n", + "Non-sequential indices -- Previous: 0, current: 2\n", + "ksize: 4\n", + "Non-sequential indices -- Previous: 0, current: 2\n" + ] + } + ], + "source": [ + "from sigseq import SigSeq\n", + "\n", + "for ksize, sigs in p66_cd47_sigs.items():\n", + " print(f\"ksize: {ksize}\")\n", + "\n", + " p66_sigseq = SigSeq(sigs[\"p66\"], H7C7N8_BORBU)\n", + " cd47_sigseq = SigSeq(sigs[\"cd47\"], CD47_HUMAN)\n", + "\n", + " try:\n", + " p66_sigseq.display_alignment(cd47_sigseq)\n", + " except ValueError as e:\n", + " print(e)\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "00497e2e", + "metadata": {}, + "outputs": [], + "source": [ + "from sigseq import KmerStitcher, FastaHeaderHighlighter\n", + "import sigseq" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "id": "4fefdefa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(6, 'fdee', 'YKLI', 10068566436590771491),\n", + " (8, 'eeef', 'LIIF', 224116582751336810),\n", + " (9, 'eefe', 'IIFL', 7860449462917742889),\n", + " (10, 'efeb', 'IFLT', 4466143379625637480),\n", + " (11, 'febb', 'FLTT', 22856495822854259),\n", + " (12, 'ebbb', 'LTTS', 6489788892270137876),\n", + " (13, 'bbbb', 'TTSA', 10636871753394754362),\n", + " (14, 'bbbb', 'TSAA', 10636871753394754362),\n", + " (15, 'bbbe', 'SAAI', 5844270199162658202),\n", + " (17, 'befb', 'AIFA', 12608447144368280135),\n", + " (18, 'efbb', 'IFAA', 6737117628605503971),\n", + " (19, 'fbbc', 'FAAD', 15487073495104592163),\n", + " (21, 'bcbe', 'ADAL', 2582718285239245651),\n", + " (23, 'bedc', 'ALKE', 9924122599953948134),\n", + " (27, 'dcef', 'KDIF', 13066860520591833755),\n", + " (32, 'ecbf', 'INPW', 7824081998478575779),\n", + " (35, 'febb', 'WMPT', 22856495822854259),\n", + " (36, 'ebbf', 'MPTF', 3777468906224825823),\n", + " (39, 'fbfc', 
'FGFE', 1717429840091522704),\n", + " (42, 'ccbb', 'ENTS', 7934574478782340536),\n", + " (43, 'cbbc', 'NTSE', 13425603466974449662),\n", + " (44, 'bbcf', 'TSEF', 401197479395011469),\n", + " (48, 'dece', 'RLDM', 3514259266920402271),\n", + " (52, 'ccee', 'DELV', 14823416379075295469),\n", + " (53, 'ceeb', 'ELVP', 4220580037854917126),\n", + " (54, 'eebb', 'LVPG', 11038159954814081015),\n", + " (55, 'ebbf', 'VPGF', 3777468906224825823),\n", + " (59, 'ccdb', 'ENKS', 9515149052504803760),\n", + " (60, 'cdbd', 'NKSK', 627200266029661451),\n", + " (64, 'ebed', 'ITIK', 10110588346040050718),\n", + " (65, 'bede', 'TIKL', 8778380644404774298),\n", + " (70, 'bfcb', 'PFEA', 18351374587090038902),\n", + " (72, 'cbcb', 'EANP', 12503683351264325607),\n", + " (75, 'bceb', 'PELG', 15690868526339966161),\n", + " (76, 'cebd', 'ELGK', 6011042865859074878),\n", + " (77, 'ebdc', 'LGKD', 18271496327093918616),\n", + " (84, 'bbfe', 'SAYI', 6002404200797057331),\n", + " (87, 'edec', 'IKVE', 2606773150647828518),\n", + " (92, 'ebed', 'LALK', 10110588346040050718),\n", + " (93, 'bedb', 'ALKA', 13946213461756747690),\n", + " (94, 'edbc', 'LKAE', 578033708912463536),\n", + " (95, 'dbcb', 'KAEG', 7885097756694372784),\n", + " (98, 'bddb', 'GKKG', 7218742382433715284),\n", + " (105, 'dece', 'KIDV', 3514259266920402271),\n", + " (106, 'eceb', 'IDVG', 9509177043812954048),\n", + " (107, 'cebc', 'DVGD', 10454519246483044435),\n", + " (108, 'ebce', 'VGDI', 4015064277273641161),\n", + " (109, 'bceb', 'GDIT', 15690868526339966161),\n", + " (111, 'ebbc', 'ITAQ', 5727016641903521905),\n", + " (112, 'bbce', 'TAQI', 4231916625216208266),\n", + " (113, 'bcec', 'AQIN', 18103179462109539146),\n", + " (124, 'ebbe', 'ISTM', 6920393162873654046),\n", + " (125, 'bbeb', 'STMT', 10595816954080188374),\n", + " (126, 'bebc', 'TMTD', 18364677344105950525),\n", + " (136, 'befb', 'SLFS', 12608447144368280135),\n", + " (137, 'efbf', 'LFSF', 15041436218241961998),\n", + " (141, 'bbeb', 'APMT', 
10595816954080188374),\n", + " (142, 'bebb', 'PMTG', 12348475939575077495),\n", + " (143, 'ebbf', 'MTGF', 3777468906224825823),\n", + " (146, 'fdbb', 'FKST', 17601202462413201014),\n", + " (149, 'bffb', 'TYYG', 493115665276685844),\n", + " (153, 'fbbc', 'FPSN', 15487073495104592163),\n", + " (154, 'bbcc', 'PSND', 8851499734005910348),\n", + " (155, 'bccd', 'SNDR', 5203889812942446964),\n", + " (156, 'ccdb', 'NDRA', 9515149052504803760),\n", + " (157, 'cdbe', 'DRAV', 7606399555099723201),\n", + " (158, 'dbed', 'RAVR', 110875883647815593),\n", + " (159, 'bedb', 'AVRG', 13946213461756747690),\n", + " (160, 'edbb', 'VRGT', 10559932379304644073),\n", + " (161, 'dbbe', 'RGTI', 14964367747587985732),\n", + " (162, 'bbee', 'GTIL', 15322539958464395896),\n", + " (163, 'beeb', 'TILA', 17475210263719431112),\n", + " (165, 'ebdb', 'LARG', 12564816384096126964),\n", + " (166, 'bdbb', 'ARGT', 14842221203416155138),\n", + " (167, 'dbbb', 'RGTS', 17726373358396768861),\n", + " (168, 'bbbd', 'GTSK', 10427112140463483717),\n", + " (170, 'bdce', 'SKNI', 17335298721270894545),\n", + " (173, 'ebbe', 'IGTI', 6920393162873654046),\n", + " (174, 'bbec', 'GTIQ', 4064199507453608151),\n", + " (176, 'eceb', 'IQLG', 9509177043812954048),\n", + " (183, 'bdec', 'PKLD', 2259711356554022352),\n", + " (184, 'dece', 'KLDL', 3514259266920402271),\n", + " (185, 'eceb', 'LDLT', 9509177043812954048),\n", + " (190, 'bebb', 'AIGG', 12348475939575077495),\n", + " (191, 'ebbb', 'IGGT', 6489788892270137876),\n", + " (192, 'bbbb', 'GGTG', 10636871753394754362),\n", + " (193, 'bbbb', 'GTGT', 10636871753394754362),\n", + " (194, 'bbbb', 'TGTG', 10636871753394754362),\n", + " (195, 'bbbc', 'GTGN', 10936565803043742378),\n", + " (205, 'dcbb', 'KDTP', 11732771003953160801),\n", + " (209, 'fcdb', 'YNKT', 8777467295491174931),\n", + " (212, 'bfcb', 'TYQG', 18351374587090038902),\n", + " (214, 'cbee', 'QGIL', 3021297321754295892),\n", + " (215, 'beef', 'GILY', 207159318435450593),\n", + " (216, 'eefb', 'ILYG', 
12562983450975136318),\n", + " (217, 'efbe', 'LYGI', 6323445226374325209),\n", + " (219, 'becb', 'GIQA', 16513065919568703664),\n", + " (220, 'ecbb', 'IQAT', 1139172134765890002),\n", + " (225, 'dbed', 'KPIK', 110875883647815593),\n", + " (226, 'bedc', 'PIKN', 9924122599953948134),\n", + " (230, 'eecc', 'LLDQ', 8067965132495951993),\n", + " (231, 'eccc', 'LDQN', 18222561503088214725),\n", + " (236, 'cbdb', 'DTKS', 7294566287391320156),\n", + " (237, 'bdbe', 'TKSV', 11859816197542080665),\n", + " (238, 'dbee', 'KSVI', 6222755038673932181),\n", + " (239, 'beeb', 'SVIA', 17475210263719431112),\n", + " (242, 'bcbb', 'AETP', 13143451112721385487),\n", + " (248, 'ecfb', 'LNFG', 4312308046274198431),\n", + " (249, 'cfbe', 'NFGL', 6246859079940908248),\n", + " (251, 'bebb', 'GLSG', 12348475939575077495),\n", + " (252, 'ebbb', 'LSGA', 6489788892270137876),\n", + " (257, 'bccb', 'GNET', 2481900653225094471),\n", + " (262, 'ccbb', 'NNSS', 7934574478782340536),\n", + " (263, 'cbbe', 'NSSI', 16279479833324286872),\n", + " (264, 'bbeb', 'SSIT', 10595816954080188374),\n", + " (268, 'fbed', 'YSLK', 9666214415774811527),\n", + " (269, 'bedc', 'SLKD', 9924122599953948134),\n", + " (272, 'cdbe', 'DKSV', 7606399555099723201),\n", + " (273, 'dbee', 'KSVV', 6222755038673932181),\n", + " (274, 'beeb', 'SVVG', 17475210263719431112),\n", + " (278, 'ccee', 'NDLL', 14823416379075295469),\n", + " (279, 'ceeb', 'DLLS', 4220580037854917126),\n", + " (280, 'eebb', 'LLSP', 11038159954814081015),\n", + " (281, 'ebbb', 'LSPT', 6489788892270137876),\n", + " (282, 'bbbe', 'SPTL', 5844270199162658202),\n", + " (283, 'bbeb', 'PTLS', 10595816954080188374),\n", + " (284, 'bebc', 'TLSN', 18364677344105950525),\n", + " (286, 'bcbb', 'SNSA', 13143451112721385487),\n", + " (287, 'cbbe', 'NSAI', 16279479833324286872),\n", + " (288, 'bbee', 'SAIL', 15322539958464395896),\n", + " (289, 'beeb', 'AILA', 17475210263719431112),\n", + " (290, 'eebb', 'ILAS', 11038159954814081015),\n", + " (291, 'ebbf', 'LASF', 
3777468906224825823),\n", + " (297, 'dfde', 'KYKL', 6306512417307503239),\n", + " (300, 'ebeb', 'LGLT', 13416682157814255285),\n", + " (301, 'bebd', 'GLTK', 16657858857931383438),\n", + " (303, 'bdec', 'TKIN', 2259711356554022352),\n", + " (305, 'eccd', 'INDK', 2362080192152468741),\n", + " (310, 'bfee', 'TYLI', 10718531886035662971),\n", + " (312, 'eeec', 'LILQ', 3327879637431277366),\n", + " (313, 'eece', 'ILQM', 3038027948003595164),\n", + " (314, 'eceb', 'LQMG', 9509177043812954048),\n", + " (316, 'ebbc', 'MGTD', 5727016641903521905),\n", + " (317, 'bbcf', 'GTDF', 401197479395011469),\n", + " (318, 'bcfb', 'TDFG', 2564856895474494016),\n", + " (319, 'cfbe', 'DFGI', 6246859079940908248),\n", + " (321, 'becb', 'GIDP', 16513065919568703664),\n", + " (322, 'ecbf', 'IDPF', 7824081998478575779),\n", + " (325, 'fbbc', 'FASD', 15487073495104592163),\n", + " (326, 'bbcf', 'ASDF', 401197479395011469),\n", + " (327, 'bcfb', 'SDFS', 2564856895474494016),\n", + " (328, 'cfbe', 'DFSI', 6246859079940908248),\n", + " (329, 'fbef', 'FSIF', 15409343862723143238),\n", + " (330, 'befb', 'SIFG', 12608447144368280135),\n", + " (335, 'ebdb', 'ISKA', 12564816384096126964),\n", + " (336, 'bdbb', 'SKAA', 14842221203416155138),\n", + " (337, 'dbbc', 'KAAN', 5358982605686529698),\n", + " (338, 'bbcf', 'AANF', 401197479395011469),\n", + " (343, 'dcbb', 'KETP', 11732771003953160801),\n", + " (344, 'cbbb', 'ETPS', 7151776787376566250),\n", + " (345, 'bbbc', 'TPSD', 10936565803043742378),\n", + " (353, 'bcef', 'AEIF', 3701223976780487860),\n", + " (357, 'cbcb', 'DPNG', 12503683351264325607),\n", + " (359, 'cbcb', 'NGNA', 12503683351264325607),\n", + " (360, 'bcbe', 'GNAL', 2582718285239245651),\n", + " (361, 'cbec', 'NALN', 12306494840529946604),\n", + " (362, 'becf', 'ALNF', 11450466598554388674),\n", + " (363, 'ecfb', 'LNFS', 4312308046274198431),\n", + " (366, 'bdcb', 'SKNT', 15142247617731513045),\n", + " (367, 'dcbc', 'KNTE', 13393529472101385437),\n", + " (369, 'bceb', 'TELG', 
15690868526339966161),\n", + " (371, 'ebeb', 'LGIA', 13416682157814255285),\n", + " (375, 'fbbb', 'FSTG', 12785923314063975223),\n", + " (376, 'bbbb', 'STGA', 10636871753394754362),\n", + " (377, 'bbbb', 'TGAS', 10636871753394754362),\n", + " (378, 'bbbe', 'GASI', 5844270199162658202),\n", + " (379, 'bbeb', 'ASIG', 10595816954080188374),\n", + " (383, 'fbfc', 'FAWN', 1717429840091522704),\n", + " (387, 'dcbb', 'KDTG', 11732771003953160801),\n", + " (388, 'cbbc', 'DTGE', 13425603466974449662),\n", + " (395, 'fbed', 'WAIK', 9666214415774811527),\n", + " (396, 'bedb', 'AIKG', 13946213461756747690),\n", + " (397, 'edbb', 'IKGS', 10559932379304644073),\n", + " (398, 'dbbc', 'KGSD', 5358982605686529698),\n", + " (404, 'bbde', 'STRL', 13444773003568638791),\n", + " (409, 'bccc', 'GEQD', 1677271796314223088),\n", + " (414, 'dbbe', 'KSGV', 14964367747587985732),\n", + " (415, 'bbeb', 'SGVA', 10595816954080188374),\n", + " (416, 'bebe', 'GVAL', 9185312005446482276),\n", + " (417, 'ebeb', 'VALG', 13416682157814255285),\n", + " (418, 'bebe', 'ALGI', 9185312005446482276),\n", + " (419, 'ebeb', 'LGIS', 13416682157814255285),\n", + " (428, 'fdbd', 'YRSK', 7815796655124299643),\n", + " (429, 'dbdc', 'RSKD', 6008455418557613888),\n", + " (430, 'bdcb', 'SKDT', 15142247617731513045),\n", + " (431, 'dcbc', 'KDTE', 13393529472101385437),\n", + " (437, 'edbe', 'LKTI', 3379849564839441876),\n", + " (438, 'dbeb', 'KTIS', 2245165869925555499),\n", + " (439, 'bebc', 'TISE', 18364677344105950525),\n", + " (441, 'bccb', 'SENA', 2481900653225094471),\n", + " (444, 'bfcb', 'AFQS', 18351374587090038902),\n", + " (446, 'cbec', 'QSLN', 12306494840529946604),\n", + " (450, 'eceb', 'VEIS', 9509177043812954048),\n", + " (452, 'ebbf', 'ISSY', 3777468906224825823),\n", + " (459, 'ddbe', 'KKGI', 17379983601788100176),\n", + " (460, 'dbee', 'KGII', 6222755038673932181),\n", + " (461, 'beec', 'GIIN', 8654096532145016621),\n", + " (464, 'cbeb', 'NGLG', 17993562543865718127),\n", + " (466, 'ebfe', 'LGWI', 
14933920156901041087),\n", + " (468, 'febb', 'WITS', 22856495822854259),\n", + " (469, 'ebbe', 'ITSI', 6920393162873654046),\n", + " (470, 'bbeb', 'TSIG', 10595816954080188374),\n", + " (471, 'bebe', 'SIGL', 9185312005446482276),\n", + " (476, 'ceed', 'DILR', 14964024876258418521),\n", + " (480, 'cdbe', 'QKSV', 7606399555099723201),\n", + " (481, 'dbec', 'KSVE', 15521531363153442454),\n", + " (482, 'becc', 'SVEN', 16775746449511419550),\n", + " (485, 'cfbb', 'NYPT', 10413764088404445244),\n", + " (486, 'fbbb', 'YPTT', 12785923314063975223),\n", + " (487, 'bbbe', 'PTTI', 5844270199162658202),\n", + " (488, 'bbeb', 'TTIS', 10595816954080188374),\n", + " (489, 'bebb', 'TISS', 12348475939575077495),\n", + " (490, 'ebbb', 'ISST', 6489788892270137876),\n", + " (491, 'bbbb', 'SSTT', 10636871753394754362),\n", + " (492, 'bbbc', 'STTE', 10936565803043742378),\n", + " (493, 'bbcc', 'TTEN', 8851499734005910348),\n", + " (494, 'bccc', 'TENN', 1677271796314223088),\n", + " (498, 'cbcc', 'QTEQ', 9144737173124411667),\n", + " (499, 'bccb', 'TEQS', 2481900653225094471),\n", + " (500, 'ccbb', 'EQSS', 7934574478782340536),\n", + " (501, 'cbbb', 'QSST', 7151776787376566250),\n", + " (502, 'bbbb', 'SSTS', 10636871753394754362),\n", + " (503, 'bbbb', 'STST', 10636871753394754362),\n", + " (504, 'bbbd', 'TSTK', 10427112140463483717),\n", + " (506, 'bdbb', 'TKTT', 14842221203416155138),\n", + " (507, 'dbbb', 'KTTT', 17726373358396768861),\n", + " (508, 'bbbb', 'TTTP', 10636871753394754362),\n", + " (509, 'bbbc', 'TTPN', 10936565803043742378),\n", + " (510, 'bbce', 'TPNL', 4231916625216208266),\n", + " (511, 'bceb', 'PNLT', 15690868526339966161),\n", + " (516, 'ccbe', 'EDAM', 17994569987617823751),\n", + " (518, 'bede', 'AMKL', 8778380644404774298),\n", + " (521, 'ebeb', 'LGLA', 13416682157814255285),\n", + " (522, 'bebe', 'GLAL', 9185312005446482276),\n", + " (527, 'ecfb', 'LDYA', 4312308046274198431),\n", + " (528, 'cfbe', 'DYAI', 6246859079940908248),\n", + " (530, 'bebe', 'AIPI', 
9185312005446482276),\n", + " (531, 'ebeb', 'IPIA', 13416682157814255285),\n", + " (532, 'bebb', 'PIAS', 12348475939575077495),\n", + " (533, 'ebbe', 'IASI', 6920393162873654046),\n", + " (534, 'bbeb', 'ASIS', 10595816954080188374),\n", + " (535, 'bebb', 'SIST', 12348475939575077495),\n", + " (536, 'ebbc', 'ISTE', 5727016641903521905),\n", + " (540, 'bfee', 'AYVV', 10718531886035662971),\n", + " (541, 'feeb', 'YVVP', 3909355865613333976),\n", + " (542, 'eebf', 'VVPY', 4195867088387656031),\n", + " (543, 'ebfe', 'VPYI', 14933920156901041087),\n", + " (545, 'febb', 'YIGA', 22856495822854259),\n", + " (546, 'ebbf', 'IGAY', 3777468906224825823),\n", + " (547, 'bbfe', 'GAYI', 6002404200797057331),\n", + " (548, 'bfee', 'AYIL', 10718531886035662971),\n", + " (549, 'feeb', 'YILG', 3909355865613333976),\n", + " (550, 'eebb', 'ILGP', 11038159954814081015),\n", + " (551, 'ebbb', 'LGPS', 6489788892270137876),\n", + " (552, 'bbbc', 'GPSN', 10936565803043742378),\n", + " (557, 'ebbc', 'LSSD', 5727016641903521905),\n", + " (559, 'bcbb', 'SDAT', 13143451112721385487),\n", + " (560, 'cbbd', 'DATK', 15963831736868780672),\n", + " (561, 'bbde', 'ATKI', 13444773003568638791),\n", + " (564, 'efed', 'IYLK', 3007946504019006955),\n", + " (566, 'edbb', 'LKTG', 10559932379304644073),\n", + " (567, 'dbbe', 'KTGL', 14964367747587985732),\n", + " (568, 'bbeb', 'TGLS', 10595816954080188374),\n", + " (569, 'bebe', 'GLSL', 9185312005446482276),\n", + " (571, 'becd', 'SLEK', 6470706253879962510),\n", + " (575, 'eedf', 'LIRF', 974291162461474114),\n", + " (579, 'bbeb', 'TTIS', 10595816954080188374),\n", + " (580, 'bebe', 'TISL', 9185312005446482276),\n", + " (581, 'ebeb', 'ISLG', 13416682157814255285),\n", + " (584, 'bfcb', 'GWDS', 18351374587090038902),\n", + " (586, 'cbcc', 'DSNN', 9144737173124411667),\n", + " (588, 'ccee', 'NNII', 14823416379075295469),\n", + " (590, 'eece', 'IIEL', 3038027948003595164),\n", + " (591, 'eceb', 'IELA', 9509177043812954048),\n", + " (592, 'cebc', 'ELAN', 
10454519246483044435),\n", + " (596, 'dcbc', 'KNTN', 13393529472101385437),\n", + " (597, 'cbcc', 'NTNN', 9144737173124411667),\n", + " (598, 'bccb', 'TNNA', 2481900653225094471),\n", + " (599, 'ccbb', 'NNAA', 7934574478782340536),\n", + " (600, 'cbbe', 'NAAI', 16279479833324286872),\n", + " (601, 'bbeb', 'AAIG', 10595816954080188374),\n", + " (602, 'bebb', 'AIGS', 12348475939575077495),\n", + " (603, 'ebbb', 'IGSA', 6489788892270137876),\n", + " (605, 'bbfe', 'SAFL', 6002404200797057331),\n", + " (614, 'fbbb', 'YSGS', 12785923314063975223)]" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "self = cd47_sigseq\n", + "other = p66_sigseq\n", + "overlap = self.get_overlapping_kmers(other)\n", + "overlap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1d01285", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 173, + "id": "27ae655c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ksize: 3\n", + "ksize: 4\n" + ] + } + ], + "source": [ + "from sigseq import SigSeq\n", + "\n", + "for ksize, sigs in p66_cd47_sigs.items():\n", + " print(f\"ksize: {ksize}\")\n", + "\n", + " p66_sigseq = SigSeq(sigs[\"p66\"], H7C7N8_BORBU)\n", + " cd47_sigseq = SigSeq(sigs[\"cd47\"], CD47_HUMAN)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "id": "adfc75d4", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Non-sequential indices -- Previous: 0, current: 2", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[174], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m 
\u001b[43mp66_sigseq\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdisplay_alignment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcd47_sigseq\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:244\u001b[0m, in \u001b[0;36mSigSeq.display_alignment\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[38;5;124;03m\"\"\"Displays the alignment between two sequences\"\"\"\u001b[39;00m\n\u001b[0;32m 243\u001b[0m \u001b[38;5;66;03m# Compute overlaps\u001b[39;00m\n\u001b[1;32m--> 244\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_overlap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 245\u001b[0m other\u001b[38;5;241m.\u001b[39mcompute_overlap(\u001b[38;5;28mself\u001b[39m)\n\u001b[0;32m 247\u001b[0m \u001b[38;5;66;03m# Verify overlaps\u001b[39;00m\n", + "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:190\u001b[0m, in \u001b[0;36mSigSeq.compute_overlap\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;124;03m\"\"\"Computes overlap information between two sequences\"\"\"\u001b[39;00m\n\u001b[0;32m 189\u001b[0m overlap \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_overlapping_kmers(other)\n\u001b[1;32m--> 190\u001b[0m overlap_seq \u001b[38;5;241m=\u001b[39m \u001b[43mKmerStitcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstitch_kmers\u001b[49m\u001b[43m(\u001b[49m\u001b[43moverlap\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_encoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 191\u001b[0m overlap_encoded \u001b[38;5;241m=\u001b[39m KmerStitcher\u001b[38;5;241m.\u001b[39mstitch_kmers(overlap, use_encoded\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 193\u001b[0m index \u001b[38;5;241m=\u001b[39m 
other\u001b[38;5;241m.\u001b[39mseq\u001b[38;5;241m.\u001b[39mindex(overlap_seq)\n", + "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:95\u001b[0m, in \u001b[0;36mKmerStitcher.stitch_kmers\u001b[1;34m(cls, overlap, use_encoded)\u001b[0m\n\u001b[0;32m 93\u001b[0m prev_i \u001b[38;5;241m=\u001b[39m i\n\u001b[0;32m 94\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 96\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNon-sequential indices -- Previous: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mprev_i\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, current: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 99\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m stitched\n", + "\u001b[1;31mValueError\u001b[0m: Non-sequential indices -- Previous: 0, current: 2" + ] + } + ], + "source": [ + "p66_sigseq.display_alignment(cd47_sigseq)" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "id": "32b99e31-5344-4529-acb6-8d8f1392db95", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:44:13.803198Z", + "iopub.status.busy": "2024-11-12T21:44:13.802750Z", + "iopub.status.idle": "2024-11-12T21:44:13.805865Z", + "shell.execute_reply": "2024-11-12T21:44:13.805585Z", + "shell.execute_reply.started": "2024-11-12T21:44:13.803185Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + 
"'edbdeefdeeefebbbbbefbbcbedcdcefdecbfebbfbfccbbcfdececceebbfccdbdebededbfcbcbcebdccbfbbfedeccebedbcbddbccfdecebcebbcecefcffedebbebcfcfcdcbefbfbbebbfdbbffbfbbccdbedbbeebdbbbdcebbecebfdebdecebfbebbbbbbcdcccccdcbbfcdbfcbeefbecbbfdbedceecccccbdbeebcbbfcecfbebbbfbccbfccbbebfbedcdbeebcceebbbebcbbeebbfbbdfdebebdeccdcbfeeecebbcfbecbfbbcfbefbdebdbbcfddcbbbcbcddbcefcbcbcbecfbdcbcebebfbbbbbebfbfcdcbbcdcbfbedbbcbfbbdefbcccddbbebebebfbccefdbdcbcddedbebccbfcbececebbfcccddbeecbebfebbebefceedcdbeccfbbbebbbbccccbccbbbbbdbbbbcebfccbedebebefecfbebebbebbcbfeebfebbfeebbbcdebbcbbdefedbbebecdeedfbbebebfcbcceecebcdcbccbbebbbfecfdebfbbb'" + ] + }, + "execution_count": 175, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_sigseq.seq_encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "69305af8-39ea-49b1-a307-b49e8d8d31a7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:49:30.134639Z", + "iopub.status.busy": "2024-11-12T21:49:30.134314Z", + "iopub.status.idle": "2024-11-12T21:49:30.137414Z", + "shell.execute_reply": "2024-11-12T21:49:30.137124Z", + "shell.execute_reply.started": "2024-11-12T21:49:30.134625Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'MKSHILYKLIIFLTTSAAIFAADALKEKDIFKINPWMPTFGFENTSEFRLDMDELVPGFENKSKITIKLKPFEANPELGKDDPFSAYIKVEDLALKAEGKKGDQFKIDVGDITAQINMYDFFIKISTMTDFDFNKESLFSFAPMTGFKSTYYGFPSNDRAVRGTILARGTSKNIGTIQLGYKLPKLDLTFAIGGTGTGNRNQENDKDTPYNKTYQGILYGIQATWKPIKNLLDQNEDTKSVIAETPFELNFGLSGAYGNETFNNSSITYSLKDKSVVGNDLLSPTLSNSAILASFGAKYKLGLTKINDKNTYLILQMGTDFGIDPFASDFSIFGHISKAANFKKETPSDPNKKAEIFDPNGNALNFSKNTELGIAFSTGASIGFAWNKDTGEKESWAIKGSDSYSTRLFGEQDKKSGVALGISYGQNLYRSKDTEKRLKTISENAFQSLNVEISSYEDNKKGIINGLGWITSIGLYDILRQKSVENYPTTISSTTENNQTEQSSTSTKTTTPNLTFEDAMKLGLALYLDYAIPIASISTEAYVVPYIGAYILGPSNKLSSDATKIYLKTGLSLEKLIRFTTISLGWDSNNIIELANKNTNNAAIGSAFLQFKIAYSGS'" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_sigseq.seq" + 
] + }, + { + "cell_type": "code", + "execution_count": 177, + "id": "645c4216-7ee5-4962-b06c-b37766854596", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:47:28.023690Z", + "iopub.status.busy": "2024-11-12T21:47:28.023361Z", + "iopub.status.idle": "2024-11-12T21:47:28.026602Z", + "shell.execute_reply": "2024-11-12T21:47:28.026319Z", + "shell.execute_reply.started": "2024-11-12T21:47:28.023677Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 177, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_sigseq.sig.minhash.ksize" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "id": "63267a8a-09c4-4678-8e62-ab32e8e73d54", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:47:29.130940Z", + "iopub.status.busy": "2024-11-12T21:47:29.130609Z", + "iopub.status.idle": "2024-11-12T21:47:29.133676Z", + "shell.execute_reply": "2024-11-12T21:47:29.133391Z", + "shell.execute_reply.started": "2024-11-12T21:47:29.130926Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'efbeebbeeebbbaabbbceefcdbdbecfbfaccbeeebafebcecbccbbcefedfdfdbdcefbfcbbecdbbebbcfbbbdecebceedbcbbedecdbcbebdbbcfbacebcebdcbcbeecedfdeebffbbccceeeeefbefbeeeffbcfbedbedfdbbbeccdbebeeebbeeebeeeeebbeefebbcfbedcbbbebeeebbbbeeeeedffefbbbebebbfeebeeeeceebfeebeebebeaebbaebedbbeeebbebeebebceebeefedfebbccdbecbbddbeccbecbfdcbdbeeccc'" + ] + }, + "execution_count": 178, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cd47_sigseq.seq_encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "id": "6231db67-2862-45b2-b4eb-a29dbb6ad13e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:49:34.422710Z", + "iopub.status.busy": "2024-11-12T21:49:34.422392Z", + "iopub.status.idle": "2024-11-12T21:49:34.425372Z", + "shell.execute_reply": "2024-11-12T21:49:34.425098Z", + "shell.execute_reply.started": 
"2024-11-12T21:49:34.422696Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'MWPLVAALLLGSACCGSAQLLFNKTKSVEFTFCNDTVVIPCFVTNMEAQNTTEVYVKWKFKGRDIYTFDGALNKSTVPTDFSSAKIEVSQLLKGDASLKMDKSDAVSHTGNYTCEVTELTREGETIIELKYRVVSWFSPNENILIVIFPIFAILLFWGQFGIKTLKYRSGGMDEKTIALLVAGLVITVIVIVGAILFVPGEYSLKNATGLGLIVTSTGILILLHYYVFSTAIGLTSFVIAILVIQVIAYILAVVGLSLCIAACIPMHGPLLISGLSILALAQLLGLVYMKFVASNQKTIQPPRKAVEEPLNAFKESKGMMNDE'" + ] + }, + "execution_count": 179, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cd47_sigseq.seq" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "id": "9cc02e40-6770-4a2c-9305-1a5553982b03", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:48:18.872968Z", + "iopub.status.busy": "2024-11-12T21:48:18.872639Z", + "iopub.status.idle": "2024-11-12T21:48:18.875221Z", + "shell.execute_reply": "2024-11-12T21:48:18.874842Z", + "shell.execute_reply.started": "2024-11-12T21:48:18.872954Z" + } + }, + "outputs": [], + "source": [ + "columns = [\"i\", \"dayhoff\", \"protein\", \"hashval\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "id": "a59d434e-77b3-47c6-ab2a-0224dd0fbfb6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:48:19.021119Z", + "iopub.status.busy": "2024-11-12T21:48:19.020791Z", + "iopub.status.idle": "2024-11-12T21:48:19.042976Z", + "shell.execute_reply": "2024-11-12T21:48:19.042669Z", + "shell.execute_reply.started": "2024-11-12T21:48:19.021105Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(295, 4)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idayhoffproteinhashval
06fdeeYKLI10068566436590771491
18eeefLIIF224116582751336810
29eefeIIFL7860449462917742889
310efebIFLT4466143379625637480
411febbFLTT22856495822854259
\n", + "
" + ], + "text/plain": [ + " i dayhoff protein hashval\n", + "0 6 fdee YKLI 10068566436590771491\n", + "1 8 eeef LIIF 224116582751336810\n", + "2 9 eefe IIFL 7860449462917742889\n", + "3 10 efeb IFLT 4466143379625637480\n", + "4 11 febb FLTT 22856495822854259" + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_in_cd47_overlap = pd.DataFrame(\n", + " cd47_sigseq.get_overlapping_kmers(p66_sigseq), columns=columns\n", + ")\n", + "print(p66_in_cd47_overlap.shape)\n", + "p66_in_cd47_overlap.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "id": "491e094e-ddfa-40ea-bc95-8400bf59c122", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:48:39.916599Z", + "iopub.status.busy": "2024-11-12T21:48:39.916278Z", + "iopub.status.idle": "2024-11-12T21:48:39.920665Z", + "shell.execute_reply": "2024-11-12T21:48:39.920373Z", + "shell.execute_reply.started": "2024-11-12T21:48:39.916585Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dayhoff\n", + "bbeb 12\n", + "bbbb 11\n", + "ebeb 7\n", + "bebb 7\n", + "bebe 7\n", + " ..\n", + "eccc 1\n", + "cbdb 1\n", + "bdbe 1\n", + "dfde 1\n", + "eedf 1\n", + "Name: count, Length: 127, dtype: int64" + ] + }, + "execution_count": 182, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_in_cd47_overlap.dayhoff.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "id": "4fe2d889-119c-48ca-bd00-542f9138c833", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:48:19.197833Z", + "iopub.status.busy": "2024-11-12T21:48:19.197509Z", + "iopub.status.idle": "2024-11-12T21:48:19.209901Z", + "shell.execute_reply": "2024-11-12T21:48:19.209603Z", + "shell.execute_reply.started": "2024-11-12T21:48:19.197819Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(199, 4)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idayhoffproteinhashval
00efbeMWPL6323445226374325209
12beebPLVA17475210263719431112
23eebbLVAA11038159954814081015
34ebbeVAAL6920393162873654046
45bbeeAALL15322539958464395896
\n", + "
" + ], + "text/plain": [ + " i dayhoff protein hashval\n", + "0 0 efbe MWPL 6323445226374325209\n", + "1 2 beeb PLVA 17475210263719431112\n", + "2 3 eebb LVAA 11038159954814081015\n", + "3 4 ebbe VAAL 6920393162873654046\n", + "4 5 bbee AALL 15322539958464395896" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cd47_in_p66_overlap = pd.DataFrame(\n", + " p66_sigseq.get_overlapping_kmers(cd47_sigseq), columns=columns\n", + ")\n", + "print(cd47_in_p66_overlap.shape)\n", + "cd47_in_p66_overlap.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "id": "0b40f3d1-28b7-436f-882d-66d8312ac6d8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:48:33.116250Z", + "iopub.status.busy": "2024-11-12T21:48:33.115922Z", + "iopub.status.idle": "2024-11-12T21:48:33.120505Z", + "shell.execute_reply": "2024-11-12T21:48:33.120208Z", + "shell.execute_reply.started": "2024-11-12T21:48:33.116237Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dayhoff\n", + "eebb 6\n", + "bbee 5\n", + "bebe 5\n", + "bcfb 4\n", + "ebce 4\n", + " ..\n", + "bede 1\n", + "bcbb 1\n", + "edbc 1\n", + "ceed 1\n", + "eccc 1\n", + "Name: count, Length: 127, dtype: int64" + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cd47_in_p66_overlap.dayhoff.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "id": "9d888649-eb01-452d-ba52-825a119cdc3b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T21:52:02.519716Z", + "iopub.status.busy": "2024-11-12T21:52:02.519502Z", + "iopub.status.idle": "2024-11-12T21:52:02.526996Z", + "shell.execute_reply": "2024-11-12T21:52:02.526710Z", + "shell.execute_reply.started": "2024-11-12T21:52:02.519703Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
i_xdayhoffprotein_xhashvali_yprotein_y
06fdeeYKLI10068566436590771491130YRVV
18eeefLIIF224116582751336810144IVIF
28eeefLIIF224116582751336810152ILLF
39eefeIIFL7860449462917742889194ILFV
49eefeIIFL7860449462917742889285LVYM
.....................
529603ebbbIGSA64897888922701378769LGSA
530603ebbbIGSA6489788892270137876213VTST
531605bbfeSAFL6002404200797057331234TSFV
532614fbbbYSGS1278592331406397522380FSSA
533614fbbbYSGS12785923314063975223227FSTA
\n", + "

534 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " i_x dayhoff protein_x hashval i_y protein_y\n", + "0 6 fdee YKLI 10068566436590771491 130 YRVV\n", + "1 8 eeef LIIF 224116582751336810 144 IVIF\n", + "2 8 eeef LIIF 224116582751336810 152 ILLF\n", + "3 9 eefe IIFL 7860449462917742889 194 ILFV\n", + "4 9 eefe IIFL 7860449462917742889 285 LVYM\n", + ".. ... ... ... ... ... ...\n", + "529 603 ebbb IGSA 6489788892270137876 9 LGSA\n", + "530 603 ebbb IGSA 6489788892270137876 213 VTST\n", + "531 605 bbfe SAFL 6002404200797057331 234 TSFV\n", + "532 614 fbbb YSGS 12785923314063975223 80 FSSA\n", + "533 614 fbbb YSGS 12785923314063975223 227 FSTA\n", + "\n", + "[534 rows x 6 columns]" + ] + }, + "execution_count": 185, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_in_cd47_overlap.merge(cd47_in_p66_overlap, on=[\"dayhoff\", \"hashval\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c6f2af40-562a-4bf6-98b1-b1c8f9e83105", + "metadata": {}, + "source": [ + "### Where is jaccard > 0.5?" + ] + }, + { + "cell_type": "markdown", + "id": "c03d1328", + "metadata": {}, + "source": [ + "# This value was dropped from jaccard > 0.7 in the hp dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "id": "2af67f6f-51f5-4ed1-97ac-b0bf4f03a76d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T19:00:34.689250Z", + "iopub.status.busy": "2024-11-12T19:00:34.689105Z", + "iopub.status.idle": "2024-11-12T19:00:34.754032Z", + "shell.execute_reply": "2024-11-12T19:00:34.753551Z", + "shell.execute_reply.started": "2024-11-12T19:00:34.689238Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querymatchmoltypeksizejaccardquery_n_kmersquery_n_unique_kmersquery_intersection_positionsmatch_n_kmersmatch_n_unique_kmersmatch_intersection_positions
0CD47p66dayhoff50.098266319269[[2, beebb], [8, eebbb], [15, bbbce], [26, bec...614491[[9, eefeb], [10, efebb], [12, ebbbb], [14, bb...
1CD47p66dayhoff60.026838318294[[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83...613586[[9, eefebb], [75, bcebdc], [105, decebc], [10...
2CD47p66dayhoff70.006579317306[[84, decebce], [105, ebdbbcf], [197, ebbcfbe]...612612[[105, decebce], [154, bbccdbe], [159, bedbbee...
\n", + "
" + ], + "text/plain": [ + " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n", + "0 CD47 p66 dayhoff 5 0.098266 319 269 \n", + "1 CD47 p66 dayhoff 6 0.026838 318 294 \n", + "2 CD47 p66 dayhoff 7 0.006579 317 306 \n", + "\n", + " query_intersection_positions match_n_kmers \\\n", + "0 [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... 614 \n", + "1 [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... 613 \n", + "2 [[84, decebce], [105, ebdbbcf], [197, ebbcfbe]... 612 \n", + "\n", + " match_n_unique_kmers match_intersection_positions \n", + "0 491 [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... \n", + "1 586 [[9, eefebb], [75, bcebdc], [105, decebc], [10... \n", + "2 612 [[105, decebce], [154, bbccdbe], [159, bedbbee... " + ] + }, + "execution_count": 191, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p66_cd47_df.query(\"jaccard > 0.001\")" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "894951f6-8e71-4f7d-8ac6-6e75b3472427", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T19:00:34.754744Z", + "iopub.status.busy": "2024-11-12T19:00:34.754593Z", + "iopub.status.idle": "2024-11-12T19:00:34.862007Z", + "shell.execute_reply": "2024-11-12T19:00:34.861660Z", + "shell.execute_reply.started": "2024-11-12T19:00:34.754732Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAANAAAADQCAYAAAB2pO90AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPXUlEQVR4nO3dfZBddX3H8fdnQ2BpEiCkSYAiDRkQJAMNdQVqQREq3XYqVYxAKSP2YYAZmNCJnYq1AxFrp4+hFGhHpkDjA9XYWk1HBGlKxWqKWSBCl6iEGGhsyBOBPNiFTfbbP85v15vlZnP2/vbs3Xv385q5s3vPw93vmdnP3HPP/Z3fVxGBmTWmo9kFmLUyB8gsgwNklsEBMsvgAJllaJsAdXd3B+CHH1U96mqbAG3fvr3ZJdgkVGmAJHVL+r6k9ZJurrP+HZKelLRP0qJh666R9Fx6XFNlnWaNqixAkqYAdwO/ApwB/IakM4Zt9iLwIeCBYfseC9wKnAucA9wqaWZVtZo1qsp3oHOA9RGxISJeBz4P/HrtBhGxMSKeBgaG7fvLwCMR8XJE7AQeAborrNWsIVUG6GeA/6l5viktG7N9JV0rqUdSz7Zt297wIgMDwYZte1j9/HY2bNvDwMBBPwuaNeSwZheQIyLuAe4B6OrqOiAdAwPBQ70vsWTFWvr6B+ic2sGyyxfSveA4OjrUlHqt/VT5DvQj4E01z09My6reF4CNO/YOhQegr3+AJSvWsnHH3tG8jNmIqgzQGuBUSSdLOhy4ElhZct+HgUskzUwXDy5Jy0rbsqtvKDyD+voH2Lq7bzQvYzaiygIUEfuAGyn+8dcBKyKiV9Jtki4FkPQ2SZuADwCfktSb9n0Z+ARFCNcAt6Vlpc09qpPOqQceXufUDubM6Mw8MrOfULvcD9TV1RU9PT1Dz/0ZyMZY3X+alr6IMJKODtG94DhOX3wBW3f3MWdGJ/NmTXN4bEy1bYCgCNH82dOZP3t6s0uxNtU2Y+HMmsEBMsvgAJllcIDMMjhAZhkcILMMDpBZBgfILIMDZJbBATLL4ACZZXCAzDI4QGYZHCCzDA6QWQYHyCyDA2SWwQEyy+AAmWVodneGIyR9Ia1/XNK8tHyqpOWSnpG0TtJHq6zTrFHN7s7wO8DOiDgFuB34s7T8A8AREXEm8FbgusFwmU0kTe3OkJ4vT7//E3CxJFF0BJsm6TDgSOB1YFeFtZo1pNndGYa2STOZvgrMogjTXmAzRQ+hv6w3M+mhujOYVW2iXkQ4B9gPnACcDHxY0vzhG0XEPRHRFRFds2fPHu8azZrenWFom3S6djSwA7gKeCgi+iNiK/AtoKvCWs0a0uzuDCuBwf6ni4B/j2Ky7heBiwAkTQPOA75XYa1mDWlqdwbgXmCWpPXAEmDwUvfdwPTUrWENcH9qBWk2obRtdwazMVa3K8FEvYhg1hIcILMMDpBZBgfILIMDZJbBATLL4ACZZXCAzDI4QGYZHCCzDA6QWQYHyCyDA2SWwQEyy+AAmWVwgMwyOEBmGRwgswwOkFkGB8gsgwNklmFCdmdI686StFpSb+rS0FllrWaNmJDdGdIspZ8Fro+IBcCFQH9VtZo1aqJ2Z7gEeDoivgsQETsiYn+FtZo1ZKJ2Z3gzEJIelvSkpD+osE6zhh12sBWS7qTo01NXRCyupKLCYcD5wNuAHwOrJD0REauG1XgtcC3ASSedVGE5ZvWN9A7UAzwBdAI/DzyXHguBw0u8dk53hk3AYxGxPSJ+DDyYajiA25tYsx00QBGxPCKWA2cBF0bEnRFxJ3AxRYgOJac7w8PAmZJ+KgXrncCzozgus3Fx0FO4GjOBo4DBDnHT07IRRcQ+SYPdGaYA9w12ZwB6ImIlRXeGz6TuDC9ThIyI2ClpGUUIA3gwIr46ukMzq94huzNI+i1gKfAoxQz17wCWpnenCcPdGaxidbszjPgOJKkD+D5wbnoAfCQiXhrb2sxa04gBiogBSXdHxNnAV8a
pJrOWUeZ7oFWS3p++4DSzGmUCdB3wReA1Sbsk7Za0q+K6zFrCIa/CRcSM8SjErBWVuYyNpJnAqRRfqgIQEY9VVZRZqzhkgCT9LnATxUiCtRQt51eT2tCbTWZlPgPdRDEm7YWIeBdwNvBKlUWZtYoyAeqLiD4oboCLiO8Bp1VblllrKPMZaJOkY4AvA49I2gm8UGVRZq2izFW496Vfl0p6lGLE9EOVVmXWIg55CifpPEkzACLiG8B/UHwOMpv0ynwG+jtgT83zPWmZ2aRXJkCKmiHbETFAye+PzNpdmQBtkLRY0tT0uAnYUHVhZq2gTICuB95Ocfv1JorbGq6tsiizVlHmKtxW0p2iZnagMlfhlqfvgQafz5R0X6VVmbWIMqdwZ0XEK4NPImInvoxtBpQLUEcajQ2ApGPxVTgzoFwQ/gpYLemLFBMrLAI+WWlVZi2izEWET0t6AnhXWnRZRHiONjNKzo0dEb3ACoqJEPdIKjWPbk57k7T+JEl7JP1+mb9nNt7KXIW7VNJzwA+BbwAbga+V2K/h9iY1lpX5W2bNUuYd6BMUd6H+ICJOppja979K7JfT3gRJ76UIbW+Jv2XWFGUC1B8ROyiuxnVExKNAV4n9Gm5vImk68BHg4yP9AUnXSuqR1LNt27YSJZmNrTIBeiX9Q38T+JykO4C91ZbFUuD2iNgz0kbuzmDNVuYy9qVAH8XcCFdTTDQ/4jtDMpr2JpuGtTc5F1gk6c+BY4ABSX0RcVeJv2s2bkZqsPWfEXE+sIWfNNoanJ30jyW9DPxFRPztQV5iqL0JRVCuBK4ats1ge5PVHNje5IKaOpYCexwem4gOGqAUnoNOrChpFvBtoG6ActqbmLWKQ7Y3GXFn6fiI2DyG9TTM7U2sYnXnhs9qMjxRwmPWLFV26TZrew6QWQYHyCyDA2SWwQEyy+AAmWVwgMwyOEBmGRwgswwOkFmGSTk91cBAsHHHXrbs6mPuUZ3MmzWNjo66Q53MRjTpAjQwEDzU+xJLVqylr3+AzqkdLLt8Id0LjnOIbNQm3Sncxh17h8ID0Nc/wJIVa9m4o+qbbK0dTboAbdnVNxSeQX39A2zd3dekiqyVTboAzT2qk86pBx5259QO5szobFJF1somXYDmzZrGsssXDoVo8DPQvFnTmlyZtaJJdxGho0N0LziO0xdfwNbdfcyZ4atw1rhJFyAoQjR/9nTmz57e7FKsxU26UzizseQAmWWoNECNdmeQ9G5JT0h6Jv28qMo6zRpVWYAyuzNsB94TEWdSTLz4marqNMtR5TtQw90ZIuKpiPjftLwXOFLSERXWataQKgPUcHeGYdu8H3gyIl4b/gfcncGabUJfRJC0gOK07rp6692dwZqtygCNpjsDw7ozIOlE4F+AD0bE8xXWadawKgM01J1B0uEUE8evHLbNYHcGqOnOIOkY4KvAzRHxrQprNMtSWYDSZ5rB7gzrgBWD3RkkXZo2u5eiI916YAkweKn7RuAU4BZJa9NjTlW1mjUqqzvDROLuDFaxse/OYDbZOUBmGRwgswwOkFkGB8gsgwNklsEBMsvgAJllcIDMMjhAZhkcILMMk3Jaq3rcscEa4QDhjg3WOJ/C4Y4N1jgHCHdssMY5QLhjgzXOAcIdG6xxvoiAOzZY4xygxB0brBE+hTPL4ACZZaj0FE5SN3AHMAX4+4j402HrjwA+DbyVYkLFKyJiY1r3UYrJ5/cDiyPi4SprPZh6IxSAuqMWvG37bFv2829lAarpzvBuinmx10haGRHP1mw21J1B0pUU0/hekbo4XAksAE4A/k3SmyNif1X11lNvhMJdV53N6/viDaMWLnnLXL6+bou3bYNtRzMKZUJ2Z0jLPx8Rr0XED4H16fXGVb0RCk9verXuqIXezW9c7m1bc9vRjEKZqN0ZyuxbeXeGeiMUBoK6oxY2v+pt22Xb0YxCaemLCFV3Z6g3QmGKqDtq4fijj/S2bbLtaEahTNTuDGX2rVy9EQpnnnh
03VELC44/ytu2ybajGYVS2dzYKRA/AC6m+OdfA1wVEb0129wAnBkR16eLCJdFxOWpL9ADFJ97TgBWAaeOdBGhqrmxB6/Q1I5QAN6wrPZqjrdt/W3rXECoe0Wh0snlJf0q8NcUl7Hvi4hPSroN6ImIlZI6Kfqfng28DFwZERvSvh8DfhvYB/xeRHxtpL/lyeWtYuMfoPHkAFnF3J3BbKw5QGYZ2uYUTtI24IVm15H8NLC92UVUZLIe2/aI6B6+sG0CNJFI6omIrmbXUQUf24F8CmeWwQEyy+AAVeOeZhdQIR9bDX8GMsvgdyCzDA6QWQYHKJOk+yRtlfTfNcuOlfSIpOfSz5nNrLERkt4k6VFJz0rqlXRTWt4Ox9Yp6TuSvpuO7eNp+cmSHpe0XtIXJB1+qNdygPL9AzD8C7abgVURcSrFSPKbx7uoMbAP+HBEnAGcB9yQbrVvh2N7DbgoIn4OWAh0SzqPYkqB2yPiFGAnxZQDI3KAMkXEYxQjyWvV3qq+HHjveNY0FiJic0Q8mX7fDayjuCu4HY4tImJPejo1PQK4iGJqASh5bA5QNeZGxOb0+0vA3GYWk0vSPIpbTh6nTY5N0hRJa4GtwCPA88AraWoBOMg0AsM5QBWL4nuClv2uQNJ04J8p7snaVbuulY8tIvZHxEKKu53PAU5v5HUcoGpskXQ8QPq5tcn1NETSVIrwfC4ivpQWt8WxDYqIV4BHgV8Ajkl3UkPJaQQcoGqsBK5Jv18DfKWJtTQkTS92L7AuIpbVrGqHY5st6Zj0+5EUcxeuowjSorRZqWPzSIRMkv4RuJBiKPwW4Fbgy8AK4CSKWywuj4jhFxomNEnnA98EngEG5336Q4rPQa1+bGdRXCSYQvEmsiIibpM0n2L+wmOBp4CrI+K1EV/LATJrnE/hzDI4QGYZHCCzDA6QWQYHyCyDA9QmJM2rHRF+iG2/XXU9k4UDNAlFxNubXUO7cIDakKT5kp6S9M5038taSU9LOjWt35N+3pbWrZX0I0n3p+VX1+z3qdRt0OpwgNqMpNMoxq99iGJYyh1p0GQXxQjjIRFxS1p3IcUtGXdJegtwBfCLad1+4DfHp/rWU2mTYRt3synGb10WEc9KWg18TNKJwJci4rnhO6Qxb58FlkXEE5JupGj6vKZYxZG0+IDRKvkdqL28CrwInA8QEQ8AlwL/Bzwo6aI6+ywFNkXE/em5gOURsTA9TouIpZVX3qIcoPbyOvA+4IOSrkqDIzdExN9QvDOdVbuxpPcAvwQsrlm8ClgkaU7a5lhJPzsu1bcgn8K1mYjYK+nXKO6y/FfgCkn9FHeP/smwzZdQ3HX5nXS6tjIibpH0R8DXJXUA/cANTJyJ+ycUj8Y2y+BTOLMMDpBZBgfILIMDZJbBATLL4ACZZXCAzDL8P8f0QbsmO2CpAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(data=p66_cd47_df, x=\"ksize\", y=\"jaccard\", height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "5afd7f1c-d1e1-40c0-97b8-396fcd0d7c52", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T19:00:34.862568Z", + "iopub.status.busy": "2024-11-12T19:00:34.862436Z", + "iopub.status.idle": "2024-11-12T19:00:34.950627Z", + "shell.execute_reply": "2024-11-12T19:00:34.950263Z", + "shell.execute_reply.started": "2024-11-12T19:00:34.862557Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 193, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADRCAYAAABSOlfvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWz0lEQVR4nO2de5RcVZWHv1+HlsaQAIYAEYhJEEQiTJAWEc0MomDEBw4IxAcIOuJCniK+QVFH14JBHBkdmQjKY/GKIBLezwDCACEkIS9QIDSKQkiiBBKmIaH3/HFOJZVKVXWdvnW7qrr2t9Zdde+559zaJ9U7Z59z99lbZobjOAOno9ECOE6r40rkOBlxJXKcjLgSOU5GXIkcJyMtr0RTpkwxwA8/6nEMiJZXouXLlzdaBKfNaXklcpxG40rkOBlxJXKcjGzSaAEcp5S+PqNnxWqWvtTLtiO7GDdqOB0darRYFXElcpqKvj7jlkXPc+r0efSu6aOrs4NzD5/ElInbNa0iuTnnNBU9K1avUyCA3jV9nDp9Hj0rVjdYssr4SOQ0jHJm29KXetcpUIHeNX0sfakXoClNPFcipyFUMtt2GzOCrs6ODRSpq7ODNa8bB533h6Y08dyccxpCJbPt9T449/BJdHWGP82uzg7OOnQPzrhuQdOaeD4SOYNCqem2YvWrZc22Zat6mTJxO3Y9aTIvvNzLNiNC3WdW/N9GdV94uZcJozcfzG6UxZXIyZ1ypttZh+7BW0ZttoFydHV2sM2ILjo6xITRm2+gIOVMvG1GdA1qPyrh5pyTO+VMt29cM58fHrz7BmbbuYdPYtyo4Ru1Hzdq+EYmXqW6jcBHIid3Kq24dQ4TNxWZbZVW3Do6tJGJ56tzzpCmdP4zZouusubYtiO7NjLbKlHOxGsWXImculJp6frnn96TEy6fu0FZs5hjWVGrh8zq7u622bNnN1oMJ7Jk2ap173MKdHV2cOOJk5FoSnOsiAEJ5CORM2BSPA6WreplnwlbN6U5lhVXImdApHocNMtydB74ErczIFI8DobS/KccuY5EkrqAe4FN43ddbWbfk3QCcAqwEzDazJbH+gJ+BhwEvAIcbWZz8pTRqY0sHgdNOv+pG3mbc68C+5vZKkmdwH2SbgbuB24A7i6p/2Fg53i8G/hl/HQaSD08DoYyuZpzFlgVLzvjYWY218x6yjQ5GL
gktnsQ2FLSmDxldPonq8fBUCf3hQVJw4BHgLcCvzCzh6pU3x74S9H1s7HsuZJnHgscCzB27Ni6yuvUbrrV6nEw1MldiczsdWCSpC2BayW9w8wWZnzmNGAahPdE2aV0CqSYbikeB0OZQVudM7MXgZnAlCrV/grsWHS9QyxzBgk33dIZ0EgkqQPY3Mxe6qfeaGCNmb0oaTPgAOCsKk1mACdIupKwoLDSzJ6rUt+pM1mdRduRmkciSZdLGilpOLAQWCzpa/00GwPMlDQfeBi43cxukHSSpGcJI818SRfE+jcBS4AngV8BX07sj5NAX5+xZNkqHnhqOUuWraKvz9h2ZNe6EadAselW8DpwBVpPzb5zkuaZ2SRJnwHeCXwTeMTM9shTwP5w37mBUcnj4MC3b8ttjy1tqZBVdSR337nO+K7nE8DPzWyNJJ/UtyiVPA5uOmly270szUqKEp0P9ACPAvdKegtQdU7kNA+1LlsX4hb4qlvt1KREcSFhqZltX1T2Z+D9eQnm1I9UjwMnjZoWFsysD/h6SZmZ2dpcpHLqii9b50uKOXeHpNOAq4B1Ab/M7O91l8rJRLt4HDRL4PsUJToifh5fVGbAhPqJ42SlXTwOminwfc3vicxsfJnDFajJaBfTrZkC39c8Ekl6I3AqMNbMjpW0M/A2M7shN+mcqqRsz251062USv1sRFTUFHPuNwRv7H3j9V+B3xL2BTmDTOr27FY23cpR8Kxohm3oKQ6oO5nZ2cAaADN7hQG+4XWy0+7bs5spKmrKSPRadCI1AEk7EXauOoOAb8/ekGaKipqiRN8DbgF2lHQZ8F7g6DyEcjbEt2eXp1n6mbI6dztwCEFxrgC6zezufMRyimmXFbdWJXU/0fbAsNjunyVhZr+rv1hOMe2y4taqpCxx/xrYA1gEFH5RA1yJ6kweAeGd/EgZifYxs91yk8QB2jMgfKuTsinvQuAnZrY4X5HSGGqb8lo8IHyrk/umvEuAByQ9T1jaFsGZu6E7W1sZDwg/NEhRoguBI4EFrJ8TOQPEA8IPHVI8FpaZ2Qwze9rMnikcuUk2xGl3j4OhRMpINFfS5cD1FHkq+BJ3bbjHwdAlRYk2IyjPgUVlvsRdA+5xMLRJUaKvlu5ilTS+zvIMSSp5HEw7sptjL53ty9YtTooSXS/pw4Wop5LeTtgK8Y5KDarkJxoPXAmMImyvONLMXpO0KWEVcC9gBXBEhewRTU27bM92AilK9GOCIn0EeBvhj/0z/bSplJ/oVOCnZnalpPOBLxByEX0B+IeZvVXSVELI4SMqPbwZaZft2c56UhxQbwR+CtwGXAT8q5nN66dN2fxEwP7A1bH8YkJASAj5iS6O51cDH4jZ81oGdxZtP/odiST9F3EPUWQL4ClC4HnM7KR+2m+Qnyi2fbEo3FYhBxEU5Scys7WSVhJMvuUlz2za/ETuLNp+1GLOlfrUPJLyBaX5iYBdU9pXeGZT5Ccq53FQaduym25Dl36VyMwu7q8OgKRrzOzQKs95UdJM4D2ENJKbxNGoOAdRIT/Rs5I2IYx6K2r5/sGmWkD4cw+ftFG5m25Dl3pmytsofFaV/EQzgU8SVug+B1wXm8yI1w/E+3dZrR6yg4wHhHcK1FOJyv2xjwEujvOiDmB6zE+0GLhS0r8Dcwl+ecTPSyU9CfwdmFpH+TLhAeGdSuSas9XM5gN7lilfAuxdprwXOCxPmQaCB4R3qlHPnK1D1l7xZWunGkkjUZzXjDWzP5a5/Y36iNR43OPASSElxsLHgHOANwDjJU0CfmBmHwcws9tykXCQcY8DJ5UUc+5MwjzmRYDorTDkHFDddHNSSTHn1pjZyhIvnKZcfq6Vdg4I79SPFCVaJOnTwLCYEeIk4H/zESt/2j0gvFM/Usy5E4GJBM/sKwhJj0/JQaZBwbdnO/Wi5pEoZoH4TjxanmpRddzjwEkhZXVuJmXmQGa2f10lyolao4r69mwnlZQ50WlF513AoUBLZA/3qK
JOntQcAbVsY2mWmW3kvjOY1BIB1aOKOjWSbwRUSW8quuwgxEHYYiBfmjcp4ak8qujQo9yrizz/c0wx5x4hzIlEMOOeJsREaCrcWbS9qWS6T5m4XW6KlBJjYbyZTYifO5vZgWZ2Xy5SZcA9DtqbSq8uelaszu07U8y5Q6rdb5ZIqO5x0N5U+v0L+7zyIMWc+wKwL3BXvH4/wWNhGU0UCdVjHLQ3lX7/PE33FI+FTmA3Mzs0xlKYCHSa2TFm9vl8xEunmVKzO4NPI37/lCRfj5nZ24uuO4BFxWWNoNwSd2F1xk239iTD7597kq87Jd1K8JuDEJn0joF8ad64x0F7M9i/f4rv3AlxcWFyLJpmZtfmI5bjtA5J28PjClxTLCA4TrNQSxjh+8zsfZJeZkMH1ELO1pG5Sec4LUAtEVDfFz9H5C+O47QeSSGzJA2T9GZJYwtHP/V3lDRT0mJJiySdHMv/SdIDkhZIul7SyKI235L0pKQ/SvrQwLrlOINHisfCicD3gKWszx5uwB5Vmq0lZNibI2kE8Iik24ELgNPM7B5Jnwe+BpwhaTdC1NOJwJuBOyTtEoPiO05TkrKwcDLwNjOrOcC8mT0HPBfPX5b0GCF9yi6EDHoAtwO3AmcQ8hNdaWavAk/HcMJ7E2JzO05TkmLO/QVYOdAvkjSOEFL4IWARQWEghA3eMZ6vy08UKc5d5DhNScpItAS4W9KNhGAlAJjZuf01lLQ5cA1wipm9FE248ySdQcgE8VqK0M2c5MtpP1KU6M/xeEM8aiLmar0GuKzg6W1mjwMHxvu7AB+J1Qv5iQoU5y5aR7Mk+XIcSPNY+H7qw2O+1QuBx4pHLEnbmNkL0f/udOD8eGsGcLmkcwkLCzsDs1K/13EGk7yj/bwXOBJYIGleLPs2sLOk4+P174DfxGctkjQdWExY2TveV+acZifFi3uvost10X7M7Ot5CFYrtQQqcZwaydeL28xKEx7fL8lNLaftGZLRfhxnMBly0X4cZ7BJMeeq5iKSdICZ3Z5dJMdpLeqZs/WsOj7LcVoGT3zsOBmppxK554DTltRTiRynLamnEvXU8VmO0zIkBSqRtC8wrridmV0SP6uGGXacoUrKy9ZLgZ2AeUDBn82AS+ovluO0DikjUTchjLAvIDhOESlzooXAdnkJ4jitSspItDWwODqdFu9s/XjdpXKcFiJFic7MSwjHaWVSfOfuqXZf0gNm9p7sIjlOa1HP90SeANVpS9ztx3Ey4m4/jpMR9+J2nIzUrESSTpS0VZUqR9ZBHsdpOVJGom2BhyVNlzQlxpRbh5ktrK9ojtMa1KxEZnY6IZjihcDRwBOSfixpp5xkc5yWIGlOFP3mno/HWmAr4GpJZ+cgm+O0BClzopMlPQKcDdwP7G5mxxFCZx1aoU2lJF+TJD0oaZ6k2ZL2juWSdF5M8jVf0jsz99BxcibF7Wcr4BAze6a40Mz6JH20QptKSb7OBr5vZjdLOihe7wd8mGAy7gy8G/hl/HScpqWmkUjSMGBqqQIVMLPHKpQ/Z2Zz4vnLQCHJlwGFFJNbAH+L5wcDl1jgQWBLSWNq7YzjNIKaRiIzez3mUB1rZn8eyBeVJPk6BbhV0jkERd43VquU5Ou5kmd5fiKnaUhZWNgKWCTpTkkzCkctDUuTfAHHAV8xsx2BrxBW/GrGzKaZWbeZdY8ePTqlqePUnZQ50RkD+YJySb6AzxFywAL8lpAIGWpM8uU4zUTKe6J7CBF9OuP5w8Ccam0qJfkizIH+JZ7vDzwRz2cAR8VVun2AlTF5suM0LSmBSr5ImIe8iRCwZHtChrsPVGlWKcnXF4GfSdoE6I3PBbgJOAh4EngFOKZW+RynUaSYc8cDexMWBjCzJyRtU62Bmd1HZcfUvUoL4svc48vUdZymJWVh4VUzW5flO44ivofIaXtSlOgeSd8GNpN0AGFB4Pp8xHKc1iFFib4JLAMWAF8izF9Oz0Mox2klUgKV9AG/iofjOJGU1b
mnKTMHMrMJdZXIcVqM1DDCBbqAwwjL3Y7T1qS8bF1RdPzVzP4T+Eh+ojlOa5BizhXv7ekgjExJqVkcZyiSogQ/Yf2caC3BBeiwegvkOK1GihLdQFCiggeCAR8txCsp8Y1znLYhRYn2At4FXEdQpI8Bs1jvPOo4bUmKEu0AvDPuUEXSmcCNZvbZPARznFYhNe7ca0XXr8Uyx2lrUkaiS4BZkq6N158ALqq3QI7TaqS4/fxI0s3A5Fh0jJnNzUcsx2kdkt7zxMg9VXezOk674alVHCcjrkSOkxFXIsfJiCuR42TElchxMuJK5DgZcSVynIzkqkRV8hNdFXMTzZPUUxTYEUnfivmJ/ijpQ3nK5zj1IO9NdWXzE5nZEYUKkn4CrIznuwFTgYnAm4E7JO1iZq/nLKfjDJhcR6Iq+YmAdbG6DweuiEUHA1ea2atm9jQhnPDeecroOFkZtDlRSX6iApOBpWZW2JNUKT9R6bOOjWkqZy9btiwniZ2hRF+fsWTZKh54ajlLlq2ir69+wXsHJUZCmfxEBT7F+lGoZsxsGjANoLu720MZO1Xp6zNuWfQ8p06fR++aPro6Ozj38ElMmbgdHR2VQsXXTu4jUYX8RIVY3ocAVxVV9/xETt3pWbF6nQIB9K7p49Tp8+hZsbouz897da5SfiKADwKPm9mzRWUzgKmSNpU0npAAeVaeMjpDn6Uv9a5ToAK9a/p44eXeujw/75GokJ9o/6Il7YPivamUmHJmtgiYDiwGbgGO95U5Jyvbjuyiq3PDP/Wuzg62GdFVl+crpARqXbq7u2327NmNFsNpYhLmRAOaIHnwRWfI09Ehpkzcjl1PmswLL/eyzYguxo0aXpdFBXAlctqEjg4xYfTmTBi9ef2fXfcnOk6b4UrkOBlxJXKcjLT86pykZcAzjZajAlsDyxstRJ0Zyn1abmZTUhu3vBI1M5Jmm1l3/zVbB+/Txrg55zgZcSVynIy4EuXLtEYLkAPepxJ8TuQ4GfGRyHEy4krkOBlxJRogkrokzZL0aIxk9P1YPl7SQzFi0VWS3hDLN43XT8b74xragTJU6dMJUW6TtHVRfUk6L96bX5Jhvmmo0q/LYlSphZJ+HTeQpvfLzPwYwEFwm988nncSYkfsQ9gPNTWWnw8cF8+/DJwfz6cCVzW6Dwl92hMYR8gYv3VR/YOAm2O7fYCHGt2HxH4dFO+JsLet8Fsl9ctHogFigVXxsjMeBuwPXB3LLyZkFIQQyejieH418AEVUq83CZX6ZGZzzaynTJODgUtiuweBLSWNGSRxa6ZKv26K94ywg3qHWCepX65EGZA0LAaefAG4HXgKeNHM1sYqxdGK1kUyivdXAqMGVeAaKO2TmT1UpXpN0ZmagWr9imbckYTd1JDYL1eiDJjZ62Y2ifA/2N7Aro2VKDulfZL0jgaLVBf66dd/A/ea2R8G8mxXojpgZi8CM4H3EIb+wmbH4mhF6yIZxftbACsGV9LaKepTNYfMlovOVNovSd8DRgOnFlVL6pcr0QCRNFrSlvF8M+AAQoTXmcAnY7XPAdfF8xnxmnj/rmiLNw0V+vR4lSYzgKPiatY+wEozey5/SdOo1C9J/wZ8CPiUmRWHA0rrV6NXTlr1APYA5gLzgYXAd2P5BMIk9Ungt8CmsbwrXj8Z709odB8S+nQSYV6wFvgbcEEsF/ALwlxwAdDd6D4k9mttlH1ePArlSf1ytx/HyYibc46TEVcix8mIK5HjZMSVyHEy4krkOBlxJXKcjLgSOcTk01v3X9MphytRC1DkRtSStLr8/eFKlAOSviPpT5Luk3SFpNMk3S2pO97fWlJPPB8m6T8kPRw3gH0plu8n6Q+SZgCLJf1A0ilF3/EjSSdX+P794vddLenxuPms320XkjaTdLOkL0oaF9teFPtymaQPSrpf0hOS9o5thscNbbMkzZV0cCw/WtIMSXcBd0oaI+lehRxVCyVNzvav3EQ02iVjqB3AXgRXkTcCIwluPq
cBdxPdRwgRN3vi+bHA6fF8U2A2MB7YD1gNjI/3xgFz4nkHwSVlVAUZ9iNstdgh1n0AeF8VmXvi8+8Ajir6vrXA7vEZjwC/JrjEHAz8Ptb7MfDZeL4l8CdgOHA0wVXoTfHeV4HvxPNhwIhG/1b1Oob0MNsgJgPXmtkrAHEkqcaBwB6SCk6rWxDSbL4GzDKzpwHMrEfSCkl7AtsCc82smhf4LIupPOM+mnHAfVXqXwecbWaXFZU9bWYL4jMWAXeamUlaEJ9XkP/jkk6L113A2Hh+u5n9PZ4/DBS2YP/ezOZVkaWlcHNu8FjL+n/v4jyHAk40s0nxGG9mt8V7pZl5LyD8D38MYVSoxqtF56/Tfy6q+4EpJWZf8TP6iq77ip4n4NAi+cea2WOl8pvZvcA/E7YUXCTpqH7kaRlcierPvcAn4vxiBPCxWN5DMPVg/VYJgFuB44qCZOwiaXiFZ19L2AfzrtiunnwX+AfBezmFW4ETC8oXR8qNkPQWYKmZ/Yrwn0FTBjUZCK5EdcbM5gBXAY8Sgl08HG+dQ1CWuYQ5UYELCIme50haCPwPFUYNM3uNsF9puuWTEPpkYDNJZye0+SEhZsH8aPL9sEK9/YBHY/+PAH6WRdBmwrdC5IykM4FVZnZOHZ7VAcwBDjOzJ7I+z6kPPhK1CJJ2I6z03ekK1Fz4SNTCSNoduLSk+FUze3eF+tcSls+L+YaZ1Xt+1Va4EjlORtycc5yMuBI5TkZciRwnI65EjpOR/wedEK9zJEHLIwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(data=p66_cd47_df, x=\"query_n_kmers\", y=\"query_n_unique_kmers\", height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "fabf3a25-b8e1-4a83-9618-edfddf8939f6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T19:00:34.951942Z", + "iopub.status.busy": "2024-11-12T19:00:34.951149Z", + "iopub.status.idle": "2024-11-12T19:00:35.041065Z", + "shell.execute_reply": "2024-11-12T19:00:35.040683Z", + "shell.execute_reply.started": "2024-11-12T19:00:34.951924Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 194, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADQCAYAAACZZoRKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUmklEQVR4nO3df7TUdZ3H8efr4q1rCv4gEDMRMNSkjPKumelWtBZhRmmhmabm5m4HI2Pbflpq7XZObtFq7qmlNcW2RNLa0MxSI7MiEYRUsI4KZLr8NhBoLwL3vX98P3MZhpm587nf+d6Z78z7cc6cO/Od73fmM/fM+3w/38+8P5+3zAzn3MB1NLoBzuWdB5FzKXkQOZeSB5FzKXkQOZdS7oNo8uTJBvjNb4NxKyv3QbRx48ZGN8G1udwHkXON5kHkXEoeRM6ltF+jG5A3vb3G6k3bWfd8D4cN62LM8APo6FCjm+UayIMoQm+vcffytcyct4yenb10dXYwa9pEJk8Y5YHUxrw7F2H1pu19AQTQs7OXmfOWsXrT9ga3zDWSn4kqKNdtW/d8T18AFfTs7OW57TsAvIvXpjyIyqjUbTv2sKF0dXbsFUhHDd+fZzf3cP4Ni7yL16a8O1dGpW7bkA6YNW0iXZ3Jv62rs4MvTX01n7r9Ee/itTE/E5VRqdu29vkeJk8YxXEzTmP91h5GDu2quO/6rT2MGX6Aj+S1AQ+iMg4b1rVPt62rs4ORQ7vo6BDjRhzIuBEH7vVc6b6jhnX5SF6b8O5cGWOGH7BPt23WtImMGX5Azfvu7sVH8tqEn4nK6OjQPt22Sl2xSvs+uGpTxW5e8VnM5Z8HUQXlum0x+1bqEo4a1sXKDdv8OqmFeHcuI+W6edef91pWrNnKlOse4P3ffpAp1z3A3cvX0ttbcaqKywHlfcms7u5uW7x4caObUVbhB9tCN88MzvjGA/ucne6acZp38fKhbJfBu3MZKu3mLXxqow+HtyAPokFU7TrJh8Pzy6+JBpEPh7emTM9EkrqAXwEvDu91m5ldKeky4HLgaGCEmW0M+wu4FpgC/BW4yMwezrKNg2kgw+HezWt+WXfndgCTzGybpE7g15J+CvwGuBP4Zcn+7wDGh9vrgW+Gvy0jdjjcu3nNL9PunCW2hYed4WZmttTMVpc5ZCpwczjud8DBk
g7Pso3NwLt5+Zb5wIKkIcAS4BXAf5jZg1V2PwL4c9HjZ8K2NSWveSlwKcDo0aPr2t5GiO3m+fyl5pJ5EJnZbmCipIOBH0l6lZk9lvI1ZwOzIfmdKH0rG6/Wbp7PX2o+gzY6Z2abgQXA5Cq7PQscWfT45WFbWyrXzfP5S81nQGciSR3AgWb2fD/7jQB2mtlmSfsDpwNfqXLIfOAySXNJBhS2mNmaKvu3tHLdPJ+/1HxqDiJJ3wf+EdgNPAQMk3Stmf1blcMOB+aE66IOYJ6Z3SlpBvBJYBTwiKS7zOzvgbtIhrefJBnivnggH6qV+Pyl5ldz7pykZWY2UdIHgNcBnwaWmNkJWTawP82cO5eFaus/eF5e5lLnznWG33reDVxvZjsltcRFfZ74/KXmExNE3wJWA78HfiXpKKDqNZHLhs9fai41BVEYSFhnZkcUbXsaeEtWDXNxCiN5xd28wvwlv07KVsw10WIz6864PdHa7ZqoGp+/lLnU10T3SvoEcCvQ96OEmT2XsmEN1UoL1Pv8pcaICaJzwt/pRdsMGFe/5gyuVl+g3hNbB0fNGQtmNrbMLbcBBK2/QL0ntg6OmB9bXwLMBEab2aWSxgPHmtmdmbUuY9V+/W+FawafvzQ4YrpzN5JkY58SHj8L/IBkXlAuVVvptFX4/KXsxSSgHm1m1wA7Aczsr1QYrciLmJVOW4l38+or5kz0QkgiNQBJR5PMXM2tmJVOW4nPX6qvmCC6ErgbOFLS94A3Ahdl0ajBFLPSaSvx+Uv1EzM6dw9wFkng3AJ0m9kvs2mWawSfvzQwsfOJjgCGhOP+VhJm9sP6N8s1Quz8pXY7e1cSM8T9HeAEYDlQ+K8a4EHUQmqdvzRyaFdLZXukEXMmOtnMjs+sJa4plUtsnTVtIqMPeYkPhwcxCag3AF8zsxXZNimOJ6BmrzSxtfCD7JTr2i65NXUC6s3AQklrSYa2RbKGXENntrrslevi+VoPe8QE0Q3ABcCj7Lkmcm3Ksx72iMlY2GBm881slZn9qXDLrGWuqXnWwx4xZ6KlYcWfOyjKVPAh7vbkWQ97xATR/iTB87aibT7E3cY86yERMzp3aOksVkljzWxVJi2rkY/ONZdyEx1nX9DNpd9d3AojealH5+6Q9I7CqqeSXkkyFeJVFd+xcn2iscBcYDjJ9IoLzOwFSS8mGQU8EdgEnFOheoRrUu24amtMEH2ZJJDOAI4l+bJ/oJ9jKtUnmgl83czmSvoWcAlJLaJLgL+Y2SsknUuy5PA5lV7cNad2W7U1JgH1J8DXgZ8DNwHvMbNl/RxTtj4RMAm4LWyfQ7IgJCT1ieaE+7cBbw3V81yOtfpIXr9nIknfIMwhCg4CniJZeB4zm9HP8XvVJwrHbjazXWGXQg0iKKpPZGa7JG0h6fJtLHnNlqpP1OpafSSvlu5c6VX7kpg3KK1PBBwXc3yF12y5+kStrpVH8voNIjOb098+AJJuN7Ozq7zOZkkLgDeQlJHcL5yNimsQFeoTPSNpP5Kz3qZa3t/lT7nk1i9NffVeI3mFLt5xTTySV89Kefssn1WlPtEC4L0kI3QXAj8Oh8wPjxeG539htY7Bu9xplflL9Qyicl/2SvWJVgBzJf0LsJQkL4/w97uSngSeA86tY/tcE2qF+UuZ1mw1s0eA15bZvhI4qcz2HuB9WbbJNbc8zl+qOWOh3xeSlprZPgGTNc9YaD1NPH8pdcYC4bpmtJn9sczTnxpIq5wrlbf5SzFrLJwJfBV4ETBW0kTgi2b2LgAz+3kmLXSO5p6/FDOf6CqS65jNACFbYWzdW+RcGc2c9RDTndtpZltKsnB8+NkNimbOeogJouWSzgOGhIoQM4DfZtIq58po1qyHmO7cR4EJJJnZt5AUPb687i1yLkIzrNpa85koVIH4XLg51xSaYf5SzOjcAspcA5nZpFQtcC6lRs9fiunOfQL453D7PLCMfTO8nWu4wR7Ji+nOlU6B+I2kRane3bkMDKTMZ
pqsh5ju3KHF7SRZB+GgAb9zAzRrAqOrv9gymys3bBvw9yJmiHsJyTWRgF3AKpI1EXKh3Co0zZLA6AZHueTW6897LSvWbE31vahbAmqj1JqAunLDtmZIYHQNVprcagZnfKPm70W6BFRJZ1V7vtlXQs3jZC9Xf6XdvIVPbUz9vYjpzl0CnAL8Ijx+C0nGwgZysBJqpf7wyKFdDWyVa7R6fC9ihrg7gePN7OywlsIEoNPMLjazD0W8TkNUGvYcM/yABrfMNVI9vhcxywg/bmavLHrcASwv3tYIMZPyyk328kEFF/G9SD0p7z5JPyPJm4NkZdJ745rbWOWGPZ1L+72I+bH1sjC4cFrYNNvMfjSgd3WuhURNDw8jcE09gODcYKtlGeFfm9mpkraydwJqoWbrsMxa51wO1LIC6qnh79Dsm+Nc/sQMcSNpiKSXSRpduPWz/5GSFkhaIWm5pI+F7a+RtFDSo5LukDSs6JjPSHpS0h8lvX1gH8u5wROTsfBR4EpgHXuqhxtwQpXDdgH/ZGYPSxoKLJF0D/BfwCfM7H5JHyJMr5B0PMmqpxOAlwH3SjomLIrvXFOKGVj4GHCsmdW8wLyZrQHWhPtbJT1OUj7lGJIKegD3AD8jmaM0FZhrZjuAVWE54ZNI1uZ2rinFdOf+DGwZ6BtJGkOypPCDwHKSgIFk2eAjw/2++kRBce0i55pSzJloJfBLST8hWawEADOb1d+Bkg4EbgcuN7PnQxfuOkmfJ6kE8UJMo73Il2smMUH0dLi9KNxqEmq13g58r5DpbWZ/AN4Wnj8GOCPsXqhPVFBcu6iPF/lyzSQmY+Hq2BcP9VZvAB4vPmNJGmlm60P+3RXAt8JT84HvS5pFMrAwHvAp6K6pZb3azxuBC4BHJS0L2z4LjJc0PTz+IXBjeK3lkuYBK0hG9qb7yJxrdjFZ3CcWPewCzgZ2mdkns2hYrby0ihtE6bK4fbUf58prq9V+nMtC26z241xWYrpzVWsRSTrdzO5J3yTn8iUqAbUfX6njazmXG/UMIl+swLWlegaRZw64tlTPIHKuLdUziFbX8bWcy42ohUoknQKMKT7OzG4Of6suM+xcq4r5sfW7wNEkxb0K+WwG3Fz/ZjmXHzFnom6SZYR9AMG5IjHXRI8Bo7JqiHN5FXMmeimwIiSdFs9sfVfdW+VcjsQE0VVZNcK5PIvJnbu/2vOSFprZG9I3ybl8qefvRF4ty7UlT/txLiVP+3EuJc/idi6lmoNI0kclHVJllwvq0B7ncifmTHQY8JCkeZImhzXl+pjZY/VtmnP5UHMQmdkVJIsp3gBcBDwh6cuSjs6obc7lQtQ1UcibWxtuu4BDgNskXZNB25zLhZhroo9JWgJcA/wGeLWZfYRk6ayzKxxTqcjXREm/k7RM0mJJJ4XtknRdKPL1iKTXpf6EzmUsJu3nEOAsM/tT8UYz65X0zgrHVCrydQ1wtZn9VNKU8PjNwDtIuozjgdcD3wx/nWtaNZ2JJA0Bzi0NoAIze7zC9jVm9nC4vxUoFPkyoFBi8iDgf8P9qcDNlvgdcLCkw2v9MM41Qk1nIjPbHWqojjazpwfyRiVFvi4HfibpqySBfErYrVKRrzUlr+X1iVzTiBlYOARYLuk+SfMLt1oOLC3yBXwE+LiZHQl8nGTEr2ZmNtvMus2se8SIETGHOld3MddEnx/IG5Qr8gVcSFIDFuAHJIWQocYiX841k5jfie4nWdGnM9x/CHi42jGVinyRXAO9KdyfBDwR7s8HPhhG6U4GtoTiyc41rZiFSj5Mch1yKMmCJUeQVLh7a5XDKhX5+jBwraT9gJ7wugB3AVOAJ4G/AhfX2j7nGiWmOzcdOIlkYAAze0LSyGoHmNmvqZyYemLphvBj7vQy+zrXtGIGFnaYWV+V73AW8TlEru3FBNH9kj4L7C/pdJIBgTuyaZZz+RETRJ8GNgCPAv9Acv1yRRaNci5PYhYq6QW+HW7OuSBmdG4VZa6BzGxcXVvkX
M7ELiNc0AW8j2S427m2FvNj66ai27Nm9u/AGdk1zbl8iOnOFc/t6SA5M0WVZnGuFcUEwdfYc020iyQF6H31bpBzeRMTRHeSBFEhA8GAdxbWKynJjXOubcQE0YnA3wA/JgmkM4FF7Ekeda4txQTRy4HXhRmqSLoK+ImZnZ9Fw5zLi9h1514oevxC2OZcW4s5E90MLJL0o/D43cBN9W6Qc3kTk/bzr5J+CpwWNl1sZkuzaZZz+RH1O09YuafqbFbn2o2XVnEuJQ8i51LyIHIuJQ8i51LyIHIuJQ8i51LyIHIupUyDqEp9oltDbaJlklYXLeyIpM+E+kR/lPT2LNvnXD1kPamubH0iMzunsIOkrwFbwv3jgXOBCcDLgHslHWNmuzNup3MDlumZqEp9IqBvre5pwC1h01RgrpntMLNVJMsJn5RlG51La9CuiUrqExWcBqwzs8KcpEr1iUpf69JQpnLxhg0b9nmv3l5j5YZtLHxqIys3bKO31xdqddkZlDUSytQnKng/e85CNTOz2cBsgO7u7r0ipLfXuHv5WmbOW0bPzl66OjuYNW0ikyeMoqOj0rLgzg1c5meiCvWJCmt5nwXcWrR76vpEqzdt7wsggJ6dvcyct4zVm7YP8BM4V13Wo3OV6hMB/B3wBzN7pmjbfOBcSS+WNJakAPKimPdc93xPXwAV9OzsZf3Wnuj2O1eLrM9EhfpEk4qGtKeE586lpCtnZsuBecAK4G5geuzI3GHDuujq3PtjdXV2MHJo1wA/gnPVKSkJlF/d3d22ePHivsd+TeQyVPYL1HKLL3Z0iMkTRnHcjNNYv7WHkUO7GDP8AA8gl5mWCyJIAmnciAMZN+LARjfFtQHPnXMuJQ8i51LyIHIupdyPzknaAPyp0e0IXgpsbHQjMtTun2+jmU0u3Zj7IGomkhabWXf/e+aTf77yvDvnXEoeRM6l5EFUX7Mb3YCM+ecrw6+JnEvJz0TOpeRB5FxKHkQDJOk7ktZLeqxo26GS7pH0RPh7SCPbmEaVlZpa4jNK6pK0SNLvw+e7OmwfK+nBsOLUrZJe1N9reRAN3E1A6Q9vnwbuM7PxwH3hcV4VVmo6HjgZmB5WY2qVz7gDmGRmrwEmApMlnQx8Bfi6mb0C+AtwSX8v5EE0QGb2K+C5ks1TgTnh/hySaoK5VGWlppb4jJbYFh52hpsBk4DbwvaaPp8HUX0dZmZrwv21tEhN25KVmlrmM0oaEhYOXQ/cAzwFbDazXWGXsqtNlfIgyoglvx3k/veDKis15f4zmtluM5tIsiDOScBxA3kdD6L6WifpcIDwd32D25NKhZWaWuozApjZZmAB8Abg4LASFdS42pQHUX3NBy4M9y8EftzAtqRSZaWmlviMkkZIOjjc3x84neS6bwHw3rBbTZ/PMxYGSNItwJtJ0ufXAVcC/0OyWtFokukZ08ysdPAhFySdCjwAPAoU1iD7LMl1Ue4/o6QTSAYOhpCcTOaZ2RcljQPmAocCS4HzzWxH1dfyIHIuHe/OOZeSB5FzKXkQOZeSB5FzKXkQOZeSB1GLkDSmOKO8n31/m3V72okHURsys1Ma3YZW4kHUgiSNk7RU0pvCnJllkh6RND48vy38/WJRyZtnJd0Ytp9fdNx/ShrSyM/T7DyIWoykY0ny3S4iSV+5NiRZdpNkJfcxsy+E595MMq3jekmvBM4B3hie2w18YHBan08tWRWijY0gyfU6y8xWSFoIfE7Sy4EfFhWY7hNy5P4bmGVmSyRdBpwIPJQ8xf60QJJplvxM1Fq2AE8DpwKY2feBdwH/B9wlaVKZY64CnjGzG8NjAXPMbGK4HWtmV2Xe8hzzIGotLwDvAT4o6byQTLnSzK4jOUOdULyzpDNJaufOKNp8H/BeSSPDPodKOmpQWp9T3p1rMWa2XdI7SWZq3gGcI2knySzUL5fsPpNk5uai0HWbb2ZfkHQF8HNJHcBOYDrNUzSg6XgWt3MpeXfOuZQ8iJxLyYPIuZQ8i
JxLyYPIuZQ8iJxLyYPIuZT+H395FTjbDVZsAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(data=p66_cd47_df, x=\"ksize\", y=\"query_n_unique_kmers\", height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "f3906de8-ce4d-4a62-9359-49fa6670318e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-12T19:00:35.042994Z", + "iopub.status.busy": "2024-11-12T19:00:35.042312Z", + "iopub.status.idle": "2024-11-12T19:00:35.195319Z", + "shell.execute_reply": "2024-11-12T19:00:35.194994Z", + "shell.execute_reply.started": "2024-11-12T19:00:35.042977Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAADRCAYAAAAKRcT2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAdpUlEQVR4nO3deXRddbn/8fcnzXSaJmnSuaWFAi0UBNtSWqYrUBXrxKAgiAP89C4c4AJXXQuvuBTx5116Lw4o/hyuonAVFBAUEcHKIIMFOtLSltJSWjq3SZtmajM+vz/2N+U0TdLTnHNykuzntdZZ2WeP35P2OXvvb/bzfWRmOOcGv7xcN8A51zc82J2LCQ9252LCg925mPBgdy4mBk2wz5s3zwB/+StTr0Fn0AR7VVVVrpvgXL+W1WCXNE/SGknrJH25i+XvkLREUqukSzstu0rS2vC6KpvtdC4OshbskoYAPwbeC5wEfFTSSZ1WexO4Grin07aVwNeBOcBs4OuSKrLVVufiIJtn9tnAOjNbb2bNwO+Ai5JXMLMNZrYcaO+07XuA+Wa228z2APOBeb1pRFt7G23tbb3Z1LlBJT+L+54AbEp6v5noTN3bbSd0XknSNcA1AJMmTTpoWWt7CzvrtrB6x2LazZg2ZgZjSyeRP6TgCD6Cc4PHgO6gM7Ofm9ksM5s1atSog5btqt/G06//iR31m9nVsIVn1j/CzvrNOWqpc7mXzWDfAkxMen9UmJftbQFYX73qkHlrd63AE39cXGUz2BcCUyRNllQIXAE8nOK2jwMXSKoIHXMXhHkpKxxSeMi8/CGFSDqS3Tg3aGQt2M2sFbiOKEhXA/eZ2UpJt0q6EEDS6ZI2A5cBP5O0Mmy7G/gm0RfGQuDWMC9lx1ROI09vfTwhpow6JQOfzLmBSYPlsnbWrFm2aNGiA+/NjOrG7WzZ+wZmxoTyyYwoGXvQF4BzPRh0l4DZ7I3PKUmMLBnHyJJxuW6Kc/2Cn+aciwkPdudiwoPduZjwYHcuJjzYnYsJD3bnYsKD3bmY8GB3LiY82J2LCQ9252LCg925mPBgdy4mPNidiwkPdudiwoPduZjwYHcuJjzYnYsJD3bnYsKD3bmY8GB3LiZyXcW1SNLvw/IXJR0T5hdIukvSCkmrJf1HNtvpXBzkuorrp4E9ZnY88H3gO2H+ZUCRm
Z0CnAZ8puOLwDnXOzmt4hre3xWmHwDeqahkiwElkvKBBNAM1Gaxrc4NetkM9lQqsR5YJ1SQ2QuMIAr8BmAbUQ332460Ioxz7mD9tYNuNtAGjAcmA1+UdGznlSRdI2mRpEW7du3q6zY6N6DkuorrgXXCJXs5UA1cCTxmZi1mthN4HpjV+QA9lWx2zh0s11VcHwauCtOXAk9aVHzuTWAugKQS4Azg1Sy21blBL6dVXIFfAiMkrQO+AHT8ee7HwLBQ1XUh8CszW56ttjoXB4O2iqtzaRp0VVz7awedcy7DPNidiwkPdudiwoPduZjwYHcuJjzYnYsJD3bnYsKD3bmY8GB3LgMkHSPplRTX/We229MVD3bn+piZnZWL43qwO5dhko6VtFTSuZJekrRM0nJJU8Ly+vDz1rBsmaQtkn4V5n88abufhVGf0ubB7lwGSToB+ANwNVEm5+1mNp0oRXtz8rpm9rWw7DxgN3CHpGnA5cDZYVkb8LFMtC0/EztxzgEwCvgT8CEzWyVpAXCzpKOAB81sbecNwjBsvwG+Z2aLJV1HNO7iwmgRCWBnJhrnZ3bnMmcv0VgM5wCY2T3AhcA+4FFJc7vY5hZgs5n9KrwXcJeZTQ+vE8zslkw0zoPducxpBi4BPinpyjCU2noz+yHRGf/U5JUlfRB4F3B90uwngEsljQ7rVEo6OhON88t45zLIzBokfQCYD/wZuFxSC7Ad+M9Oq3+BaNDVl8Il+8Nm9jVJXwX+JikPaAGuBTam2zYfvMK5rvngFc65gcmD3bmY8GB3LiY82J2LiX5ZxTUsO1XSAkkrQzXX4my21bnBrl9WcQ3VYX4DfNbMTiZ6nLAlW211Lg76axXXC4DlZvYygJlVm1lbFtvqXL8jaUO4ql0mKe2/K2fzoZquqrjO6W4dM2uV1FHFdSpgkh4net74d2b2X1lsq3O9ds+S2/OAecBMYAnw2JUzb2jP0O7PN7OqTOyo22CX9COiOuldMrPru1uWAflEzxefDjQCT0habGZPdGrjNcA1AJMmTcpic5zrWgj0h4iege/w8D1Lbr8kgwGfET1dxi8CFgPFRN9Ya8NrOlCYwr7TqeK6GXjGzKrMrBF4NLThIF7F1fUD8zg40Anv52Vg30b02OzicGJLS7fBbmZ3mdldRA/vn2dmPzKzHwHvJAr4w0mniuvjwCmShoYvgXOBVUfwuZzrK4echIIZGdj3OWY2k6iT+1pJ70hnZ6l00FUAZUnvh4V5PUqniquZ7QG+R/SFsQxYYmZ/SekTOde3lnQzf2m6OzazLeHnTqJbhdnp7C+VDrpvA0slPUWUHPAOohzcwzKzR4kuwZPnfS1pej9wWTfb/oboz2/O9WePEV2hHnTPHub3mqQSIM/M6sL0BcCtae2zp6y3kGJ3BrCet3rSXzSz7ekcNBs8681lWMpZb0m98TOIzuhp98aHXPiHwtt84B4z+1Za+zxciqukpWaWifuPrPJgdxkWyxTXJyR9ODzs4pwboFIJ9s8A9wNNkmol1UmqzXK7nHMZdtgOOjMr7YuGOOeyK6XHZSVVAFOIHrABwMyeyVajnHOZd9hgl/SvwA1ET8AtI+qdXwB0NSyuc66fSuWe/QaiZ9Q3mtn5RH9eqMlmo5xzmZdKsO8PD78gqcjMXgVOyG6znHOS7pS0M7k6bBhHfr6kteHnYZ9m7ZDKPftmScOBPwLzJe0hA2NYOzdYXP/nQ1Ncf/jBjGS8/Rq4A7g7ad6XgSfM7Nth9KcvAzelsrMjGjde0rlEmWmPhQEp+g1/qMZlWErPlYRAPyTFFbgkEwEfhmp7xMzeFt6vIUpM2yZpHPC0maV0pX3Yy3hJZ0gqBTCzfwBPk5mMHucGg2ymuHZljJltC9PbgTGpbpjKPftPgPqk9/VhnnMuuymuPQrp4ClfmqcS7LKka30za8drxDnXIWsprt3YES7fCT9TLuecSrCvl3S9pILwuoEoC84591aKa7K0U1x7kDzgy1VE1
WFTkkqwfxY4i2gIqY5BI9MeIse5wSB0wl0CvB/4aviZqc65e4keYDtB0mZJnyYaX+LdktYSlXv+dsr78yquznVp0GV5ptIbf1f4O3vH+wpJd2a1Vc65jEvlMv5UM6vpeBPGh/M/vTk3wKQS7HnJj+RJqsR7450bcFIJ2u8CCyTdT3QfcymQ1lhYzrm+l8rgFXdLWgycH2Z9yMx8DHfnBpiUCjua2UrgPqK/8dVLSqnWUjolm8PySZLqJX0pleM557qXSm/8heFvem8A/wA2AH9NYbtel2xO8r1UjuXcYNRNiustkraEyq7LJL0v1f2lcs/+TaLRaf5uZjMknQ98PIXtDpRsDo3sKNmcfAtwEW8VnHgAuEOSzMwkXUz0BdOQygdxLlfe/eMfHJLiOv/aG7OV4grwfTO77Uh3lsplfIuZVRP1yueZ2VPArBS266pk84Tu1gnlovYSlYMaRpSj+42eDiDpGkmLJC3atWtXCk1yLrNCoD8E/IXoxPgX4KEwPy1hnMfd6e6nQyoNqgnB9yzwW0m3k/2z7S1E3171Pa3kVVxdP9DXKa4A10laHi7zUx6pJpVgv5CoRvoNRA/3rwM+kMJ26ZRsngP8l6QNwI3AVyRdl8IxnetrfZ3i+hPgOKJKytuI/jSekm7v2SU9Z2bnADt4K2e243nh/ytpN/DfZvb/utnFgZLNREF9BXBlp3U6MngWcHDJ5n9JasctQL2Z3ZHqh3KuD/VpiquZ7eiYlvQ/wCOpbttTffZzws9SMysLr9LwKie6b7+hh+17XbLZuQGkT1NcO3LZg0uAV7pb95Bt08l6kzQuaYicnPKsN5dhKWe9JfXGH6jimone+JDieh4wkugK++vh/XSiq+0NwGdSjUFPcXWua/FLcXXODQ4e7M7FhAe7czHhwe5cTHiwOxcTHuzOxYQHu3P9kKSJkp6StErSylCvIetVXJ1zPZhy022HpLiu/c6X0n2ophX4opktCbUWF0uaD1xNL6u4+pnduTSEQD8kxTXM7zUz22ZmS8J0HdEj5xOIxoC4K6x2F3Bxqvv0YHc0tzbR2t6a62YMVFlPcQ3Dtc0AXiSNKq5+GR9jDc0NbNi9ltU7XqG0qIy3j5/FmNJxSIPuSdFs6inF9dF0dx7GkvgDcKOZ1Sb/24QRnTJaxdUNQmbGa7tW8s8Nz7Bn327erNnAo6sforqxKtdNG2iyluIqqYAo0H9rZg+G2Vmt4uoGocbmBpZvPfj/Y5u1sbvRh/c6QllJcVV0Cv8lsNrMvtdp31mr4uoGIeXlUTCk4JD5Q+R3dkci9LofUsU1A73xZwOfAOZ2GknWq7h6iuuRW1u1hqfXPX7g/dCCEj5w0ocoT6T8p9vBbNB1XPjXeIwdM3wy7512MVtq3qSkcBgTyid6oA9iHuwxVpBfyFHlkziqPKUCP26A83t252LCg925mPBgdy4mshrsva3iKundkhZLWhF+zs1mO52Lg6wFe5pVXKuAD5rZKUQPDvxvttrpXH/UQ4prVqu49lY6VVyTH+1aCSQkFZlZUxbb61yvHPuRbx2S4rr+vpuzleIKWazi2lu9ruLaaZ0PA0u6CnSv4upyLQT6ISmuYX6v9ZDi2mv9uoNO0slEl/af6Wq5V3F1/UBfp7hCFqu49lY6VVyRdBTRN+Ynzez1LLbTuXRktYpr5xRX0qjims1gP1DFVVIhURXXrrKDOjJ4DlRxlTSc6HLoy2b2fBbb6Fy6+jTF1cx2mFmbmbUD/0PUN5aSrAV7mlVcrwOOB76W1Os4OlttdS4NfZrimrMqrv2JZ725DEs56y2pN/5AFdd0e+MlnQM8C6wAOvb1FeCjeBVXD3aXUYMuxbVf98Y75zLHg925mPBgdy4mPNidiwkPdudiwoPduZjwYHeuH5JULOklSS+HFNdvhPmTw9gP68JYEIWp7tMHnHQuTafM/OIhKa4rlnw33RTXJmCumdWHx2afk/RXoidNv29mv5P0U6IxIX6Syg79z
O5cGkKgH5LiGub3mkXqw9uC8DJgLtHYD+BVXJ3rU1lLcZU0RNIyonpu84HXgZqQdwJdjxHRLQ9212/UNdWzZe8WqhqqaGtvy3VzUpW1FNeQ3TadKD18NnBiOvvze3bXL2yv286fV/2FhuYG8pTHWUefyanjTqEwP+X+p1zJWoprBzOrkfQUcCYwXFJ+OLt3NUZEt/zM7nJuf8t+nlj7FA3NDQC0WzvPbXieqoYBUT46Wymuo8K4DkhKAO8mShV/imjsB/Aqrm6g2deyj10Nh44hWNtUl4PWHJnQ635IFdcM9MaPA56StJxoIJj5ZvYIcBPwhTAGxAiinPeU+GW8y7nigmIqE5Xs3rf7oPmlRcNy1KIjEwL70fDKCDNbThf3/WG05pRHp0nmZ3aXc4mCBO+aOpfi/OID8+ZMms2oEh9ENJP8zO76hfFl4/nojCuo3b+XovxiKhMV5A/x/56Z5L9N12+UF5dRXlyW62YMWn4Z71xMeLA7FxNZvYyXNA+4HRgC/MLMvt1peRFwN3AaUXGIy81sQ1j2H0QP+bcB15vZ49lsa1xt27aHNzdVUVJSRGlpgq3baygvTzA0UcjWXXupHD6MwoIhbK+qY0TFMJQndu2pZ3RlKW3WTvXeRsaOKGV/Sys1DfsZP6KMvfuaaGxqZnxFGdX1DbS0G2OHl7Kzto485TG6rISte2spys9nxLAE2/bWMbSwgLJEEdv31lGWKGZoYQE76uqoSAylMD+PXfX1VA4dypC8PKobGxgxtIQ22qndv48RJSW0trVR37yfESWl7Gtpoqm1hRFDS6lrbqStvZ3y4gTVDXuRxMih5dS3NDIkbwijSypJFBTl+p+hT2Qt2JOquL6b6BnehZIeNrPkwo4HqrhKuoKo1NPlodrrFcDJwHjg75KmmtmAeYZyIFi1ahOfv/4XVO+O8i3eO28GTW1tPPnsKi67ZA7rt1SzeMVGrr7iHJ5d8jqvb6ricx87j/ufXEpVTQM3fGwuP/3zC7S0tXHj5e/g2w/8g2HFhVx70dncev+TjK8s5RNzZ/Kff/oHU8eO4AOnn8jtf1vAjKPHcvpxR/Gr5xZxztSjOaqyjAeXvMK8t01FecaTr67jQzNPYVttDUs3beHK2TNZtnkTb1RXcdWcOcx/bRV79jVy1Zwz+OOKxbS0tfHx08/kvpcXUJifz2Wnns69y56lIjGMeSe+nT+88hzjSis4Y9IJPL72BY4ePoYTR03guY1LmTbqWC6cdi7DE6U5/tfIvmxexh+o4mpmzUBHFddkFxFl7kCUyfPOMDj+RcDvzKzJzN4A1tHLvy26rjU07ue7tz9yINAB/vrYUk4+YQJmxn0PvsCZM4+jrb2dO+99lgvOPomW1jZ+es8/uOjcU9nf3MrPH3iWi//lZOoam7jr0UV8cM40qmobeXjBKs5/22Q2V9eyeO0W3j5pDK9tr2bb7jqOqihj6cbtYFBaXMRzr21k5LBhFAzJ47FXXmPq6NEY8IclK5g5aSLtZvzmxcWcc9xxtLa3c/dLLzF36ok0tbZyz6KFnHf8NBpbmvnTiqWcefQUavfv4+nX1zB93GSqG+tYuX0LkyvGsq1uD7vqaxmRKGdjzQ5a2oxEQRGrd61nw56tufuH6EP9tYprKtt6Fdc01NbuY9nLGw6Zv39f8yHTZkZzU5Ro1dzSSnt7VGtgT90+EoUFALy5Yw9jyqOHYF55YxtTxo0EYMXG7QemV27aybGjozqEa3fsZmJlOQDba+oYPjQBQENzC3mKhmzf39yR3AXNrdFFXUtbGx21DuqbmigMf57bUVfL8EQJAG/s3sn48koANuzZybiyaHpjzS7GDIuOv61uN5WJ6Phb6nYewW9u4BrQHXRexbX3hpeXMGf2lEPmFycKD5nOyxMFRVFQJYoK6CgrMnJ4CfX7oy+E4yaMZMvuvQDMnHIUKzftAOC04yccmJ5+zFhe214NwAnjRrKxag8A4yvK2N3QCEBJYQHtIZiLC
6Nj5kkU5A8BoCg//8DxhycS7G+Jjj+hvILqhujx2qmjxvHmnujLf8qIcWyqiaaPrRzDtvrqsP5IqhprAJhYPvYIfnMDV3+t4prKti4NiUQhN1z3PiZOjM66eXniI5eexZIVG8jPH8KnPnEeT73wKkWF+Vx79Tt55OnllAwt4rpPzuWhJ5cxvDTBNZeew4PPrGB0xTCufM9MHl24homjhnPBrCk89+pGTjxqJNMmjWL11ipmHTuBspJitu+t5/xpk9nX3ExDcwvvf/sJbNqzBzP4yOmnsmLrVvLz8rjqzNNYsP4NivKH8Omz5/DkmjUkCgr41Jln8vc1qygtKuLKWbN5ct1qKhJD+cDJp/LCm2sZPayMs44+nhXbNzKxfATHjRzNpr27OK5yHOXFCfbsq2PaqKNpb2+hqbWZ0yecxDEV43P8r9E3slb+KQTva8A7iQJ1IXClma1MWuda4BQz+2zooPuQmX0k1GW/h+g+fTzwBDClpw46L//UO1XVdWzeXB31xpcl2La9hrLSBImhRWzfuZeK8qEUFOSzc3ctI4YPQ3mwq6aB0RWlNLe1UVO3jzEjSmloaqF+XxPjKsvY27iffS2tjKsopaqugbZ2Y+zwYeyorSdPYnRZCdv21lGUn09lSYJttXUMLSigPFHE9to6hhUXU1JQwI76OoYnEhTmD6Gqvp6KkhLyBLsbG6PeeGujdv9+RpaU0NTaSmNLEyOGltLQsp+WtlYqE8Oob9lHW3s7ZYVDqd5XgyRGDC2nvrmR/LwhjBw6nML8gq5+NYOu/FNWa71Jeh/wA6I/vd1pZt+SdCuwyMwellQM/C/RA/+7gSvCg/5Iuhn4FNBKVJv6rz0dy4PdZZgHe3/lwe4ybNAF+4DuoHPOpc6D3bmYGDSX8ZJ2ARtz3Y5ujAQGxBhLaRpMn7PKzNIeIbY/GTTB3p9JWmRms3LdjmyLy+ccqPwy3rmY8GB3LiY82PvGz3PdgD4Sl885IPk9u3Mx4Wd252LCg925mPBgT5OkiZKekrRK0kpJN4T50yW9IGlZyLmfHeZL0g8lrZO0XFJ3hQH7HUnFkl6S9HL4rN8I8ydLejF8pt9LKgzzi8L7dWH5MTn9AHFnZv5K40VUpmdmmC4lyvQ7Cfgb8N4w/33A00nTfyV69voM4MVcf4Yj+KwChoXpAuDF8BnuI0piAvgp8Lkw/Xngp2H6CuD3uf4McX75mT1NZrbNzJaE6Tqi4nsTAAM6BkEvBzrGProIuNsiLxBV5RzXx83uldDmjnGsCsLLgLlEw4pBNMzYxWG6u2HHXA54kYgMCpepM4jOeDcCj0u6jeh26aywWndDbm3rs4amIQwkuhg4nmhA0deBGouGFYODhxA7aNgxSR3Djg2WR2oHFD+zZ4ikYcAfiHLva4HPAf9uZhOBf+cIqm32Z2bWZmbTiUYPmg2cmNsWuVR5sGeApAKiQP+tmT0YZl8FdEzfz1uj4w6KIbfMrIaoVviZRLciHVeJyZ+nu2HHXA54sKcp3IP+ElhtZt9LWrQVODdMzwXWhumHgU+GXvkzgL1mNlAu4UdJGh6mE0Q1AVYTBf2lYbWrgD+F6YfDe8LyJy301rm+50/QpUnSOcCzwAqgPcz+ClBLVA0nH9gPfN7MFocvhzuAeUAj8H/MbEAMsSPpVKIOtyFEJ4r7zOxWSccS1QWoBJYCHzezpp6GHXN9z4PduZjwy3jnYsKD3bmY8GB3LiY82J2LCQ9252LCg925mPBgjyFJ/+yj41wt6Y6+OJY7PA/2LEt6jLTfMLOzDr/WwBASc1wKPNg7kXSzpNckPSfpXklfkvS0pFlh+UhJG8L0EEn/LWlhGIjiM2H+eZKelfQwsErSrZJuTDrGtzoGueji+OeF4z0g6VVJv+0pLVTSBkkjw/QsSU+H6Vsk3Rn2tV7S9Unb1IefknSHpDWS/i7pUUmXHma/JWG/L0laKumiFH+v75e0IPz+fi3pJ2Fwj/XhM98pabWkXydtc
0HYZomk+0OyUUfbviNpCXCZpOsVDR6yXNLvUmlPHPW7s04uSTqNaJCF6US/myVE6Zzd+TTRs+2nSyoCnpf0t7BsJvA2M3sjpL4+CPxAUl44xuwu9xiZAZxM9Hz988DZwHO9+EgnAucTDaqxRtJPzKwlafklwAlEg22MAVYBdx5mnzcTPeP+qfCc/EuS/m5mDd1tIOkS4AvA+8xsT/juqiBKormQ6Bn6s4F/BRZKmk6UKvtV4F1m1iDpprCPW8Nuq81sZtj/VmByeER3+OF/LfHkwX6wfwEeMrNGgHBm7skFwKkdZ0OirK4pQDPwkpm9AWBmGyRVS5pBFFRLzayn7K+XzGxzaMMy4Bh6F+x/MbMmoEnSznDszUnL3wHca1Hd+62SnkxhnxcAF0r6UnhfDEwiSojpylxgFnBBSP3t8GczM0krgB1mtgJA0kqiz3sU0ZfQ8+HLoRBYkLT975OmlwO/lfRH4I8pfIZY8mBPTStv3fIUJ80X8G9m9njyypLOAzqf6X4BXA2M5fBnz6ak6TZ6/nfqrm1Hup9U9yvgw2a2JsX9vA4cC0wFkhN+OtrW3qmd7aGdbcB8M/toN/tN/v2+n+iL64PAzZJOSRpMwwV+z36wZ4CLJSUklRL95wHYAJwWpi9NWv9x4HOK8tmRNFVSSTf7fogo0+30sF2mJLftw0e47TPA5aHvYRzRJf/h9vs48G8d/QjhaqUnG8P2d0s6+Qja9gJwtqTjw3FKJE3tvFK4LZpoZk8BNxFdXQ07guPEhgd7kjCW3O+Bl4kGhVwYFt1GFNRLiSqVdvgF0X3uEkmvAD+jm7OnmTUT5X3fFy6bM+UbwO2SFhGdDY/EQ0R59quAuzn4Mrm7/X6TaOy55eGS+5uHO4iZvQp8DLhf0nGpNMzMdhFdCd0raXloW1ej4gwBfhNuB5YCPwwDa7hOPMW1B5JuAerN7LYM7CuPqMPvMjNbe7j1cyH0hD9iZg8cbl038PiZvQ9IOglYBzzRXwPdDX5+Zs8RSacQjeKSrMnM5nSz/kPA5E6zb+rcOZgLkt4DfKfT7DfM7JJctMd1zYPduZjwy3jnYsKD3bmY8GB3LiY82J2Lif8PXJB3bE1Kqo8AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.relplot(\n", + " data=p66_cd47_df,\n", + " x=\"query_n_unique_kmers\",\n", + " y=\"jaccard\",\n", + " height=3,\n", + " hue=\"ksize\",\n", + " palette=\"crest\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e94664e6-fe0c-45ec-9fd5-9b3cac851fa9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1498b57-8a4f-4d45-81be-61ed007cdb53", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21b8532b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}