diff --git a/notebooks/16-ced9-bcl2-and-p66-cd47-dayhoff (1).ipynb b/notebooks/16-ced9-bcl2-and-p66-cd47-dayhoff (1).ipynb
new file mode 100644
index 0000000..1c04381
--- /dev/null
+++ b/notebooks/16-ced9-bcl2-and-p66-cd47-dayhoff (1).ipynb
@@ -0,0 +1,3982 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ec46447c-5e3e-4d2c-9e28-5d1f7860deaf",
+ "metadata": {},
+ "source": [
+ "# Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "02d8e1bf-6ca0-40a0-9ef3-af317592cc82",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:31.268053Z",
+ "iopub.status.busy": "2024-11-12T21:31:31.267726Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.639537Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.639214Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:31.268039Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# Handwritten local modules\n",
+ "from sig2kmer import degenerate_protein_chatgpt\n",
+ "\n",
+ "import sourmash"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc6e9b32-0acd-4727-9ba3-261edafe783e",
+ "metadata": {},
+ "source": [
+ "## Try aligning dayhoff versions of CED9 and BCL2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "e63be64c-3bd8-4bea-bf3c-2284addf8046",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.640388Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.640079Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.642225Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.641997Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.640376Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ced9_seq = \"MTRCTADNSLTNPAYRRRTMATGEMKEFLGIKGTEPTDFGINSDAQDLPSPSRQASTRRMSIGESIDGKINDWEEPRLDIEGFVVDYFTHRIRQNGMEWFGAPGLPCGVQPEHEMMRVMGTIFEKKHAENFETFCEQLLAVPRISFSLYQDVVRTVGNAQTDQCPMSYGRLIGLISFGGFVAAKMMESVELQGQVRNLFVYTSLFIKTRIRNNWKEHNRSWDDFMTLGKQMKEDYERAEAEKVGRRKQNRRWSMIGAGVTAGAIGIVGVVVCGRMMFSLK\"\n",
+ "bcl2_seq = \"MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPPVVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "1408cb59-5dfb-4d13-843e-8b1deda6e0a0",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.642586Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.642487Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.649951Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.649731Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.642578Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ebdabbccbebcbbfdddbebbbcedcfebedbbcbbcfbecbcbccebbbbdcbbbddebebcbecbdeccfccbdececbfeecffbddedccbecffbbbbebabecbcdceedeebbefcdddbccfcbfacceebebdebfbefcceedbebcbcbccabebfbdeebeebfbbfebbdeecbececbcedcefefbbefedbdedccfdcdcdbfccfebebdcedccfcdbcbcdebdddccddfbeebbbebbbbebeebeeeabdeefbed'"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ced9_seq_dayhoff = degenerate_protein_chatgpt(ced9_seq, \"dayhoff\")\n",
+ "ced9_seq_dayhoff"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "a6bc3196-4903-4b1f-9c75-aa543227691f",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.650626Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.650521Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.654521Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.654306Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.650617Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ebdbbdbbfccdceeedfedfdebcdbfcfcbbcebbbbbbbbbbbbefbbcbbdbbdbbbbdcbebdbbbecbbbbbbbbbbbbebbebbeedebedcbbccfbddfddcfbcebbcedebbfbbdbdfbbeeccefdcbecfbdeebffcfbbeeaecbecdcebbeeccebefebcfecddedbfecccbbfcbfecefbbbedbefcfbfebedbeebebeebbaebebbfebdd'"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bcl2_seq_dayhoff = degenerate_protein_chatgpt(bcl2_seq, \"dayhoff\")\n",
+ "bcl2_seq_dayhoff"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "7be79de6-4856-4cdd-99c5-2e6b1902fd51",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.654897Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.654799Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.658857Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.658627Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.654888Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def kmerize(sequence, ksize):\n",
+ " kmers = [sequence[i : (i + ksize)] for i in range(len(sequence) - ksize + 1)]\n",
+ " return kmers\n",
+ "\n",
+ "\n",
+ "def calculate_jaccard(set1, set2):\n",
+ " union = set1.union(set2)\n",
+ " intersection = set1.intersection(set2)\n",
+ " return len(intersection) / len(union)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "07af846e",
+ "metadata": {},
+ "source": [
+ "**Note:** `ksizes` was lowered from (5, 31), the range used with the hp dataset, to `range(2, 15)` for this comparison."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "df3a181f-6ec3-429e-86a2-d2f7f490852a",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.659221Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.659126Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.675748Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.675523Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.659212Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " query | \n",
+ " match | \n",
+ " moltype | \n",
+ " ksize | \n",
+ " jaccard | \n",
+ " query_n_kmers | \n",
+ " query_n_unique_kmers | \n",
+ " match_n_kmers | \n",
+ " match_n_unique_kmers | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 2 | \n",
+ " 0.818182 | \n",
+ " 238 | \n",
+ " 28 | \n",
+ " 279 | \n",
+ " 32 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 3 | \n",
+ " 0.606061 | \n",
+ " 237 | \n",
+ " 100 | \n",
+ " 278 | \n",
+ " 112 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 4 | \n",
+ " 0.198113 | \n",
+ " 236 | \n",
+ " 172 | \n",
+ " 277 | \n",
+ " 209 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 5 | \n",
+ " 0.055046 | \n",
+ " 235 | \n",
+ " 204 | \n",
+ " 276 | \n",
+ " 256 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 6 | \n",
+ " 0.016667 | \n",
+ " 234 | \n",
+ " 215 | \n",
+ " 275 | \n",
+ " 273 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 7 | \n",
+ " 0.002033 | \n",
+ " 233 | \n",
+ " 219 | \n",
+ " 274 | \n",
+ " 274 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 8 | \n",
+ " 0.000000 | \n",
+ " 232 | \n",
+ " 222 | \n",
+ " 273 | \n",
+ " 273 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 9 | \n",
+ " 0.000000 | \n",
+ " 231 | \n",
+ " 223 | \n",
+ " 272 | \n",
+ " 272 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 10 | \n",
+ " 0.000000 | \n",
+ " 230 | \n",
+ " 224 | \n",
+ " 271 | \n",
+ " 271 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 11 | \n",
+ " 0.000000 | \n",
+ " 229 | \n",
+ " 225 | \n",
+ " 270 | \n",
+ " 270 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 12 | \n",
+ " 0.000000 | \n",
+ " 228 | \n",
+ " 226 | \n",
+ " 269 | \n",
+ " 269 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 13 | \n",
+ " 0.000000 | \n",
+ " 227 | \n",
+ " 226 | \n",
+ " 268 | \n",
+ " 268 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 14 | \n",
+ " 0.000000 | \n",
+ " 226 | \n",
+ " 226 | \n",
+ " 267 | \n",
+ " 267 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n",
+ "0 BCL2 ced9 dayhoff 2 0.818182 238 28 \n",
+ "1 BCL2 ced9 dayhoff 3 0.606061 237 100 \n",
+ "2 BCL2 ced9 dayhoff 4 0.198113 236 172 \n",
+ "3 BCL2 ced9 dayhoff 5 0.055046 235 204 \n",
+ "4 BCL2 ced9 dayhoff 6 0.016667 234 215 \n",
+ "5 BCL2 ced9 dayhoff 7 0.002033 233 219 \n",
+ "6 BCL2 ced9 dayhoff 8 0.000000 232 222 \n",
+ "7 BCL2 ced9 dayhoff 9 0.000000 231 223 \n",
+ "8 BCL2 ced9 dayhoff 10 0.000000 230 224 \n",
+ "9 BCL2 ced9 dayhoff 11 0.000000 229 225 \n",
+ "10 BCL2 ced9 dayhoff 12 0.000000 228 226 \n",
+ "11 BCL2 ced9 dayhoff 13 0.000000 227 226 \n",
+ "12 BCL2 ced9 dayhoff 14 0.000000 226 226 \n",
+ "\n",
+ " match_n_kmers match_n_unique_kmers \n",
+ "0 279 32 \n",
+ "1 278 112 \n",
+ "2 277 209 \n",
+ "3 276 256 \n",
+ "4 275 273 \n",
+ "5 274 274 \n",
+ "6 273 273 \n",
+ "7 272 272 \n",
+ "8 271 271 \n",
+ "9 270 270 \n",
+ "10 269 269 \n",
+ "11 268 268 \n",
+ "12 267 267 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Compare the BCL2 and ced9 Dayhoff k-mer sets for every k-mer size in ksizes\n",
+ "ksizes = range(2, 15)\n",
+ "\n",
+ "# Column names, listed in the same order as the values of each row built below\n",
+ "jaccard_columns = [\n",
+ "    \"query\",\n",
+ "    \"match\",\n",
+ "    \"moltype\",\n",
+ "    \"ksize\",\n",
+ "    \"jaccard\",\n",
+ "    \"query_n_kmers\",\n",
+ "    \"query_n_unique_kmers\",\n",
+ "    \"match_n_kmers\",\n",
+ "    \"match_n_unique_kmers\",\n",
+ "]\n",
+ "\n",
+ "lines = []\n",
+ "for ksize in ksizes:\n",
+ "    # Keep both the full k-mer list (total count) and its set (unique count)\n",
+ "    bcl2_dayhoff_kmers = kmerize(bcl2_seq_dayhoff, ksize)\n",
+ "    bcl2_dayhoff_kmers_set = set(bcl2_dayhoff_kmers)\n",
+ "\n",
+ "    ced9_dayhoff_kmers = kmerize(ced9_seq_dayhoff, ksize)\n",
+ "    ced9_dayhoff_kmers_set = set(ced9_dayhoff_kmers)\n",
+ "\n",
+ "    jaccard = calculate_jaccard(bcl2_dayhoff_kmers_set, ced9_dayhoff_kmers_set)\n",
+ "\n",
+ "    # BCL2 is reported as the query and ced9 as the match\n",
+ "    row = [\"BCL2\", \"ced9\", \"dayhoff\", ksize, jaccard]\n",
+ "    row += [len(bcl2_dayhoff_kmers), len(bcl2_dayhoff_kmers_set)]\n",
+ "    row += [len(ced9_dayhoff_kmers), len(ced9_dayhoff_kmers_set)]\n",
+ "    lines.append(row)\n",
+ "\n",
+ "jaccard_df = pd.DataFrame(lines, columns=jaccard_columns)\n",
+ "jaccard_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "2fe0e852-a6e8-4b77-abf2-4081fc9ca998",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.676166Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.676066Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.809486Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.809211Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.676157Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAANAAAADQCAYAAAB2pO90AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAANg0lEQVR4nO3df5BdZ13H8ffn0sA6+QEl2aQOJYSMoUCUSXG1HUg1pdVZMhihdEJbGWGmEmHA1Ak64oA1A9YRGTNaLQqMhYgWTP1R4hgLnUwKVdLS7TRWEqGtYVtTSTZZ2uZHXdh6v/5xzm63y+7es/vcc8+evZ/XzJ29P55797udfnLOee45z1cRgZnNTaPqAszqzAEyS+AAmSVwgMwSOEBmCWoXoP7+/gB8863TtynVLkCnTp2qugSzcbULkNl84gCZJXCAzBKcV3UB7dBsBoPD5zhxeoRVy3pYs3wxjYaqLsu6QO0D1GwGdx4+zo49hxgZbdKzqMGurRvoX3+BQ2Slq/0u3ODwufHwAIyMNtmx5xCDw+cqrsy6QakBktQv6duSHpX0oSleXy3pgKQHJT0kafNsf8eJ0yPj4RkzMtpk6MxIQuVmxZQWIEkvAG4B3gy8FrhW0msnDfsIsCciLgauAT4529+zalkPPYue/2f0LGqwcmnPnOo2m40yt0A/DTwaEUcj4gfAF4FfnDQmgGX5/RcD/zPbX7Jm+WJ2bd0wHqKxY6A1yxfPvXKzgsqcRHgZ8N8THh8DLpk0ZifwFUm/BiwGrpzqgyRtA7YBrF69+nmvNRqif/0FvHr7ZQydGWHlUs/CWedUPYlwLfC5iLgQ2Ax8XtIP1RQRn46Ivojo6+3t/aEPaTTE2t4lXLp2BWt7lzg81jFlBugJ4OUTHl+YPzfR9cAegIg4CPQAK0qsyaytygzQ/cA6Sa+U9EKySYK9k8Y8DlwBIOk1ZAE6WWJNZm1VWoAi4lngA8CXgf8km207LOmjkrbkwz4IvEfSvwNfAN4dXuXEakR1+/+1r68vBgYGqi7Dus+UB9ZVTyKY1ZoDZJbAATJL4ACZJXCAzBI4QGYJHCCzBA6QWQIHyCyBA2SWwAEyS+AAmSVwgMwSOEBmCRwgswQOkFkCB8gsgQNklsABMkvgAJklcIDMEjhAZgkcILMEDpBZAgfILIEDZJag9k2G58Jdva1dKu2Rmo/ZKumIpMOSbiuzHniuq/fmm+/h2s/cx+ab7+HOw8dpNuu1RrjND5X2SJW0Dvht4I0RsR749bLqGeOu3tZOVfdIfQ9wS0Q8CRARQyXWA7irt7VXmQGaqkfqyyaNeRXwKkn/JuleSf1TfZCkbZIGJA2cPJnWf8tdva2dqp6FOw9YB2wi65f6GUkvmTyoVY/U2XBXb2unMmfhivRIPQbcFxGjwHckPUwWqPvLKspdva2dqu6RegfZ1gdJK8h26Y6WWBPgrt7WPlX3SP0yMCzpCHAA+M2IGC6rJrN2c49Us2LcI9Ws3RwgswQOkFkCB8gsgQNklsABMkvgAJklcIDMEjhAZgkcILMEDpBZAgfILIEDZJZg2gvqJP0pMO2p2hGxvZSKzGpkpi3QAPAA0AO8Hngkv20AXlh6ZWY1MO0WKCJ2A0h6H7Axv0AOSX8B3NOZ8szmtyLHQOcDyyY8XpI/Z9b1iiwq8gfAg5IOkF2V9zPAzjKLMquLGQMkqQF8G7gkvwH8VkQcL7swszqYMUAR0ZR0S0RcDHypQzWZ1UaRY6D9kt4uyWs/mU1SJEC/CtwOfF/SaUlnJJ0uuS6zWmg5iRARSztRiFkdFVraV9L5ZEvujq/AHhFfK6sos7poGSBJvwLcQLa29SHgUuAg8KZSKzOrgSLHQDcAPwU8FhGXAxcDT5VZlFldFAnQSESMAEh6UUR8C7io3LLM6qHIMdCxvGfPHcBdkp4EHiuzKLO6aLkFioi3RcRTEbET+B3gL4G3FvnwIk2G83FvlxSS+grWbTYv
tAyQpEslLQWIiK8Cd5MdB7V6X8smw/m4pWTHWffNqnKzeaDIMdCfA2cnPD6bP9dKkSbDAB8DPg64y6/VTpEAKSY0EYqIJsWOnVo2GZb0euDlEfHPMxbQxibDZu1UJEBHJW2XtCi/3UAb2jDmZ3rvAj7Yamw7mwybtVORAL0XeANZg+BjZJc1bCvwvlZNhpcCPw7cLWmQ7AvavZ5IsDopci7cEFmD4NkabzJMFpxrgOsmfO7TwIqxx5LuBn4jIty/0WqjyCzc7vx7oLHH50u6tdX7CjYZNqu1IpMBr4uIp8YeRMSTklpOY+dj9wH7Jj134zRjNxX5TLP5pMgxUCM/GxsASS+l4FncZgtdkSD8EXBQ0u1ki4pcDdxUalVmNVFkEuGvJD0AXJ4/dVVEHCm3LLN6KLQrlh/8nyS/oE7S6oh4vNTKzGqgyCzcFkmPAN8BvgoMAv9Scl1mtVBkEuFjZF9yPhwRrwSuAO4ttSqzmigSoNGIGCabjWtExAHAZwuYUewY6ClJS8gWlP8bSUPAuXLLMquHIlugLcAzZNfs3Ak8CrylzKLM6mKmBlv/GhEbgRM812hrbHXS35P0PeATEfHJkms0m7dm6g+0Mf855cKKkpYDXwccIOtac+6Rmk8sbGpfKWb1k9RkOCK+265CzOrIXbrNEjhAZgkcILMEDpBZAgfILIEDZJbAATJL4ACZJXCAzBI4QGYJHCCzBA6QWQIHyCyBA2SWoNQAteqRKmmHpCOSHpK0X9IryqxnrprN4OjJsxz8r1McPXmWZjNav8m6QmlrXE/okfpzZH2F7pe0d9Kqpg8CfRHxjKT3AX8IvKOsmuai2QzuPHycHXsOMTLapGdRg11bN9C//gIaDbX+AFvQytwCteyRGhEHIuKZ/OG9ZE245pXB4XPj4QEYGW2yY88hBoe9MJGVG6CWPVInuZ5pVjytskfqidMj4+EZMzLaZOiMeyLbPJlEkPROssUaPzHV61X2SF21rIeeRc//z9SzqMHKpT0drcPmpzID1KpHKgCSrgQ+DGyJiO+XWM+crFm+mF1bN4yHaOwYaM3yxRVXZvNBmY2yZuyRCpB3uvsU0J/3Yp13Gg3Rv/4CXr39MobOjLByaQ9rli/2BIIBJQYoIp6VNNYj9QXArWM9UoGBiNhLtsu2BLhdEsDjETHv+qc2GmJt7xLW9i6puhSbZxRRr+80+vr6YmDAjbyt46bc5ZgXkwhmdeUAmSVwgMwSOEBmCRwgswQOkFkCB8gsgQNklsABMkvgAJklcIDMEjhAZgkcILMEDpBZAgfILIEDZJbAATJLUOaaCF2r2QwGh89x4vQIq5Z5DYWFzAFqM69k2l28C9dmXsm0uzhAbeaVTLuLA9RmXsm0uzhAbeaVTLuLJxHazCuZdhcHqASzXcnU09715QBVzNPe9eZjoIp52rvequ6R+iJJf5u/fp+kNWXWMx/NZdp7tj1bPX768an9b6vukXo98GRE/Jika4CPM896pJZtbNp7Yohmmvae7S6fx08/vh27z5X2SM0f787v/x1whfI+J91ittPes93l8/jpx7dj97nqHqnjYyLiWeBpYPnkD6qyR2rZxqa9922/jC9uu4R92y+b8V/A2e7yefz049tx1kgtJhGq7JHaCWPT3peuXcHa3iUz7j7M9kwHj59+fDvOGqm6R+r4GEnnAS8GhkusqfZmu8vn8dOPb8dZI6V1qMsD8TBwBVlQ7geui4jDE8a8H/iJiHhvPolwVURsnelz3aHuuS9ei57p4PHTj5/F2KmfLLPFo6TNwB/zXI/Umyb2SJXUA3weuBj4HnBNRByd6TMdIKtI5wNUBgfIKuIeqWbt5gCZJajdLpykk8BjVddR0ArgVNVFdNBC/ntPRUT/5CdrF6A6kTQQEX1V19Ep3fb3gnfhzJI4QGYJHKByfbrqAjqs2/5eHwOZpfAWyCyBA2SWwAEqiaRBSf8h6ZCkBXfukaRbJQ1J+uaE514q6S5Jj+Q/
z6+yxk5wgMp1eURsWKDfjXwOmPzF4oeA/RGxDtifP17QHCCbk4j4GtkZ9BNNvER/N/DWTtZUBQeoPAF8RdIDkrZVXUyHrIqI7+b3jwOrqiymE7ywYnk2RsQTklYCd0n6Vv6vdleIiJC04L8j8RaoJBHxRP5zCPhHslWKFroTkn4UIP85VHE9pXOASiBpsaSlY/eBnwe+OfO7FoS9wLvy++8CvlRhLR3hMxFKIGkt2VYHst3k2yLipgpLajtJXwA2kV3CcAL4XeAOYA+wmuySk60RMXmiYUFxgMwSeBfOLIEDZJbAATJL4ACZJXCAzBI4QDUmac3Es6FbjP162fV0IweoS0TEG6quYSFygBYISWslPSjpZyV9I78O6SFJ6/LXz+Y/P5q/dkjSE5I+mz//zgnv+1TeYdBacIAWAEkXAX8PvBu4GviTiNgA9JE1NhsXETfmr20iuxzhzyS9hqy15hvz1/4P+KXOVF9vPhu7/nrJzjm7KiKOSDoIfFjShcA/RMQjk9+Qt9H8a2BXRDwg6QPAT5L1sQX4EbrgRNB28Bao/p4GHgc2AkTEbcAW4H+BfZLeNMV7dgLHIuKz+WMBu/OrZzdExEURsbP0yhcAB6j+fgC8DfhlSdflJ7IejYibybZMr5s4WNIvAFcC2yc8vR+4Or92aWxtg1d0pPqa8y7cAhAR5yS9BbgL+CfgHZJGya4K/f1Jw3eQNXf+Rr67tjcibpT0EbIraBvAKPB+6rOIf2V8NrZZAu/CmSVwgMwSOEBmCRwgswQOkFkCB8gsgQNkluD/AeDcPj5PCDaEAAAAAElFTkSuQmCC",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Jaccard similarity of the BCL2 vs. ced9 Dayhoff k-mer sets as a function of k-mer size.\n",
+ "# The trailing semicolon keeps the FacetGrid repr out of the cell output.\n",
+ "sns.relplot(data=jaccard_df, x=\"ksize\", y=\"jaccard\", height=3).set(\n",
+ "    title=\"BCL2 vs. ced9 Dayhoff k-mer Jaccard\"\n",
+ ");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "e3d860cc-45c8-4b94-9d00-fc15f874276b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.809965Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.809855Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.888595Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.888321Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.809955Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADRCAYAAABSOlfvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAT20lEQVR4nO3dfZRcdX3H8fdncHVxkwh5piFhCSfVhqMGWBWVeEKtdk3VoGiEHkU4HGN7eDyISoVaWquHUkzbeFrlsYAnPqQoEBUjEFHAIrCGhbBBDYRVSfO4WvJAl2yYb/+4v0kmy8zu/Hbm7syd+b7OuWfu/c29M99J5ru/e+/8HmRmOOfGLlfvAJzLOk8i56rkSeRclTyJnKuSJ5FzVcp8EnV3dxvgiy/jsZSU+STauXNnvUNwLS7zSeRcvXkSOVclTyLnqvSKegeQhnze6B/Yy7Zdg8yY1E7nlA5yOdVtf9fcmi6J8nljTd9WLlnVy+BQnva2HMuXLqD7+Jklv+jjsb8naHNT1hugdnV1WU9Pz4HtTTv2sHjFAwwO5Q+UtbfluOvChcydNuFlx6e5f9oJWjjGk27clPyHbbprom27Bg/5ggMMDuXZvntw3PfvH9h7ICEK+12yqpf+gb0lXzt2/0LSLV7xAGde/zCLVzzAmr6t5POl/zDm88amHXt46JmdbNqxp+x+Lk7TJdGMSe20tx36sdrbckyf2D7u+6ed0DFJF5twrnJNl0SdUzpYvnTBgS964ZSoc0rHuO+fdkKnWSuC11yVarobC7mc6D5+Jq+7cCHbdw8yfeLI1wlp7l9IuOHXOKMlaKX7F5Ju+PVZbK1Y6tpvLNdnrarpbiw0msKFfyUJGrt/zBc97RsuLaLkf0TT1USNJpcTc6dNqPiLF7N/mrVibM0FrXun0JMo4ypNutjT1phTRWjt07+mu7Hgyisk3MlzpzJ32oQRv9yxN1zGcuOiWXhN5EqKrbnGcvrXLDyJXFkx12exp3/NxE/nXE3Env41E6+JXE3Env41E08iVzOxt/ObhSeRq4tm+k3Jk8iNu2b7TclvLLhx12y/KaWaRJJmS7pP0gZJfZIuCuWTJd0jaWN4PDKUS9IKSU9LekLSiWnG5+ojtstHo0u7JtoPfMrM5gMnA+dJmg9cBqw1s3nA2rAN8B5gXliWAV9NOT5XB7FdPhpdqklkZlvMbF1Y3w08BcwClgC3hN1uAU4L60uAWy3xc+AISUelGaMbf832m9KYbixIygETzGxXxDGdwAnAw8AMM9sSntoKzAjrs4DfFR32XCjbUlSGpGUkNRVz5swZwydw9dRsvylVXBNJ+oakSZI6gCeBDZI+XeGxE4DvABcPTzxLOjRFdWoys+vMrMvMuqZNmxZzqGsQMY1hG13M6dz8kACnAT8EjgU+NtpBktpIEmilmX03FG8rnKaFx+2hfDMwu+jwo0OZcw0rJonaQkKcBqw2syFGqUEkCbgReMrMlhc9tRr4eFj/OHBnUflZ4S7dycDzRad9zjWkmGuirwH9wOPA/ZKOAUa7Jno7SW21XlJvKPsccBWwStK5wG+ApeG5u4DFwNPAC8A5EfE5VxcVJVG4kbDNzGYVlf0WOHWk48zsQcr0SwfeWWJ/A86rJCbnGkVFp3Nmlgc+M6zMzGx/KlE5lyEx10T3Sro0tEKYXFhSi8y5jIi5JvpIeCw+3TJgbu3CcS57Kk4iMzs2zUCcy6qYH1tfLekKSdeF7XmS3pteaM5lQ8w10X8C+4C3he3NwD/WPCLnMiYmiY4zs6uBIQAze4Hyt6+daxkxSbRP0uGEVgqSjgNeTCUq5zIk5u7c3wFrgNmSVpK0Rjg7jaCcy5KYu3P3SFpH0rlOwEVmtjO1yJzLiNhOebOAw4BXAu+Q9MHah+RctlRcE0m6CXgD0AcUOsgb8N2yBznXAmKuiU4OYyU4N+4aeZy6mCR6SNJ8M9uQWjTOldDo49TFXBPdSpJIvwrD
Wa2X9ERagTlX0Ojj1MXURDcSOthx8JrIudQ1+txHMUm0w8xWpxaJc2U0+txHMadzj4URf86U9MHCklpkzgWNPk5dTE10OEkzn3cXlfktbpe6Rh+nLiaJPmVmvy8ukOR9jNy4aOS5j2JO574naVJhQ9KfAN+rfUjOZUtMEn2JJJEmSDoJuA34aDphOZcdMQ1QfxAGb7wbmAh8wMx+nVpkzmXEqEkk6SscOtLpa4BngPMlYWYXphWcc1lQSU3UM2z7F2kE4lxWjZpEZnbLaPsASPqOmZ1efUjOZUstJ/ny8edcS6plEkXNMeRcs/DZw52rUi2T6GVtMCTdJGm7pCeLyq6UtFlSb1gWFz33N2Hm8F9J+vMaxuZcaqKSSNLhkl5b5unPlii7GeguUf4vZrYgLHeF154PnAEcH475D0mHxcTnXD3EDCP8PqCXZNgsJC2QdKBrhJndPfwYM7sf+P3w8jKWAN8ysxfN7FmSib7eXGl8ztVLTE10JcmX+n8BzKyXZN7WsTg/9I69SdKRoazczOEvI2mZpB5JPTt27BhjCM7VRkwSDZnZ88PKxnJH7qvAccACYAvw5dgX8NnDXSOJSaI+SX8JHBZmhPgK8N+xb2hm28zspTD73vUcPGXzmcNdJsUk0QUkF/0vAt8kmfT44tg3lHRU0eYHgMKdu9XAGZJeFfopzQMeiX1958ZbTCvuF4DLw1IRSd8EFgFTJT1HMp73IkkLSE4F+4FPhtfvk7QK2ADsB84zs5cqfS/n6kXJhN0V7CjdR4lrIDP701oHFaOrq8t6eoa3kXUuFSX7o8d0D7+0aL0dOJ2kxnCupcWczg3vAvEzSX7N4lpezID2k4s2c8BJJB30nGtpMadzvyC5JhLJadyzwLlpBOVclsSczvnwWM6VEHM6N+Jop2bmgzi6lhRzOncu8Dbgx2H7VJIWCzvwkVBdC4tJojZgvpltgQMtD242s3NSicy5jIhp9jO7kEDBNmBOjeNxLnNiaqK1kn5E0m4O4CPAvbUPyblsibk7d364ubAwFF1nZrenE5Zz2RFTExXuwPkNBOeKVDKM8INmdoqk3RzaAFWAmdmkMoc61xIqGQH1lPA4Mf1wnMueqNO5MPrOjOLjzOy3tQ7KuSyJabFwAUmnum0cnD3cgDekEJdzmRFTE10EvNbMBtIKxrksivmx9XfA8NF+nGt5MTXRJuAnkn5AMlgJAGa2vOZROZchMUn027C8MizOOeJaLPx9moE4l1Uxd+cacrQf5+rNR/txrko+2o9zVfLRfpyrko/241yVajbaj6R3mdk91YfkXLbUcs7Wf6rhazmXGfWY+HiypHskbQyPR4ZySVoRJj5+QtKJNYzNtZh83ti0Yw8PPbOTTTv2kM+PZT66ytQyiUpFeTMvn/j4MmCtmc0D1oZtgPeQzEk0D1hGMqOec9HyeWNN31YWr3iAM69/mMUrHmBN39bUEqmWSfQyZSY+XgLcEtZvAU4rKr/VEj8Hjhg2IZhzFekf2Mslq3oZHEp67AwO5blkVS/9A3tTeb9aJlF/hfvNKBp6aytJJz/wiY9djWzbNXgggQoGh/Js3z2YyvvF9mx9G9DJoT1bbw2PIw4zXIqZmaToOtbMrgOug2SSr9jjXXObMamd9rbcIYnU3pZj+sT2VN6v4ppI0teBa4BTgDeFpWsM77mtcJoWHreHcp/42NVE55QOli9dQHtb8vVub8uxfOkCOqd0pPJ+MTVRF8kwwtX+5V8NfBy4KjzeWVR+vqRvAW8Bnh824qpzFcnlRPfxM3ndhQvZvnuQ6RPb6ZzSQS5XcrbIqsUk0ZPATKDiL3aZiY+vAlZJOhf4DbA07H4XsBh4GngB8DG+3ZjlcmLutAnMnTYh9feKSaKpwIbQ6LS4Z+v7yx1gZmeWeeqdJfY14LyIeJxrCDFJdGVaQTiXZTFt53460vOSHjKzt1YfknPZUsvfidK5f+hcg0u72Y9zTS/VZj/OtYJUW3E71wpiWixc
UOi2UMbHahCPc5kTUxPNAB6VtEpSt6RDah4ze7LMcc41tYqTyMyuIOnrcyNwNrBR0pckHZdSbM5lQtQ1UWhVsDUs+4EjgdskXZ1CbM5lQsyQWRcBZwE7gRuAT5vZkKQcsBH4TDohOtfYYpr9HAl80Mx+U1xoZnlJ761tWM5lR0Wnc2GayTOGJ1CBmT1V06icy5CKksjMXgJ+JWlOyvE4lzmxp3N9oSvEgREfRuoK4VwriEmiv00tCucyLKorhKRjgHlmdq+kVwOHpReac9kQ0+znE8BtwLWhaBZwRwoxOZcpMT+2nge8HdgFYGYbgelpBOVclsQk0Ytmtq+wIekVeB8i56KS6KeSPgccLuldwH8B30snLOeyIyaJLgN2AOuBT5IMcXVFGkE5lyUxd+fywPVhcc4FMQ1Qn6XENZCZza1pRM5lTOwwwgXtwIeByWX2da5lxHTKGyhaNpvZvwJ/kV5ozmVDzOlc8fSPOZKaKWpqFueaUUwSfJmD10T7SSb1+nCtA3Iua2KS6PskSVQYoMSA9xbGKzGz5bUNzblsiEmik0gm9rqTJJHeBzxC0jU8mqR+YDfwErDfzLokTQa+TTIbXz+w1Mz+MJbXd268xCTR0cCJZrYbQNKVwA/M7KNVvP+pZrazaLsws/hVki4L25+t4vWdS13suHP7irb3cXDS4lopN7O4cw0rpia6FXhE0u1h+zTg5ire24C7w8TH14bJjMvNLH4IScuAZQBz5niPdVdfipmCNdzmXhg27zezx8b8xtIsM9ssaTpwD3ABsNrMjija5w9mNtLQxXR1dVlPT89Yw3AuRsnx5qN+5zGzdcC6WkRjZpvD4/ZQu72ZMLO4mW0ZNrO4cw2rLlOrSOqQNLGwDrybZGLlwszicOjM4s41rHq1OJgB3B5+Y3oF8A0zWyPpUUrPLO5cw6pLEpnZJuCNJcoHKDGzuHONzGfKc65KnkTOVcmTyLkqeRI5VyVPIueq5EnkXJU8iZyrknfvdi0vnzf6B/aybdcgMya10zmlg1yuZDO5kjyJXEvL5401fVu5ZFUvg0N52ttyLF+6gO7jZ1acSH4651pa/8DeAwkEMDiU55JVvfQP7B3lyIM8iVxL27Zr8EACFQwO5dm+e7Di1/Akci1txqR22tsOTYP2thzTJ7ZX/BqeRK6ldU7pYPnSBQcSqXBN1Dmlo+LX8BsLrqXlcqL7+Jm87sKFbN89yPSJfnfOuWi5nJg7bQJzp00Y2/E1jse5luNJ5FyVPImcq1LUkFmNSNIOkvEYms1UYOeoe2Vflj7nTjPrHl6Y+SRqVpJ6zKxr9D2zrRk+p5/OOVclTyLnquRJ1Liuq3cA4yTzn9OviZyrktdEzlXJk8i5KnkS1YGk2ZLuk7RBUp+ki0L5FyQ9IalX0t2S/iiUS9IKSU+H508c+R0axxg+6yJJz4fyXkmfr+8nqICZ+TLOC3AUydSdABOBXwPzgUlF+1wIfC2sLwZ+SDI/zsnAw/X+DCl+1kXA9+sdd8ziNVEdmNkWS+Z6wpI5cJ8CZpnZrqLdOkhmE4RkGs5bLfFz4Igwf1PDG8NnzRzvClFnkjqBE4CHw/YXgbOA54FTw26zgN8VHfZcKNtChlT4WQHeKulx4H+AS82sb5xDjeI1UR1JmgB8B7i48JfZzC43s9nASuD8esZXSxGfdR1wjJm9EfgKcEcdwo3iSVQnktpIvlQrzey7JXZZCZwe1jcDs4ueOzqUZULMZzWzXWa2J6zfBbRJmjpuwY6BJ1EdKJki8EbgKTNbXlQ+r2i3JcAvw/pq4Kxwl+5k4Hk7OMt6Q4v9rJJmhmOQ9GaS7+jA+EUcz6+J6uPtwMeA9ZJ6Q9nngHMlvRbIk3Tv+Kvw3F0kd+ieBl4AzhnXaKsT+1k/BPy1pP3A/wFnWLht16i82Y9zVfLTOeeq5EnkXJU8iZyrkieRc1XyJHKuSp5EzlXJk8i9jKT+Rm8l0Eg8iTJIUqZ/JM96/MN5Eo0DSZdL
+rWkByV9U9Klkn4iqSs8P1VSf1g/TNI/S3o0dFr7ZChfJOkBSauBDZL+QdLFRe/xxUKHtxLvvyi8322SfilpZaFpzShxHy7ph5I+IakzHHtz+CwrJf2ZpJ9J2hia6CCpQ9JNkh6R9JikJaH8bEmrJf0YWCvpKEn3h453T0paWN2/ch3Vu0NTsy/AScB64NXAJJKmO5cCPwG6wj5Tgf6wvgy4Iqy/CugBjiXprLYXODY81wmsC+s54BlgSpkYFpF0Nzg67PsQcMoIMfeH178XOKvo/fYDrw+v8QvgJpKOgkuAO8J+XwI+GtaPIOmE1wGcTdKFY3J47lPA5WH9MGBivf+vxro0VbXaoBYCt5vZCwChJhnJu4E3SPpQ2H4NMA/YBzxiZs8CmFm/pAFJJwAzgMfMbKSGmo+Y2XMhhl6SpHhwhP3vBK42s5VFZc+a2frwGn3AWjMzSevD6xXif7+kS8N2OzAnrN9jZr8P648CN4UW3neYWe8IsTQ0P52rn/0c/PcvnttQwAVmtiAsx5rZ3eG54bPx3kDyF/4cklphJC8Wrb/E6I2PfwZ0DzvtK36NfNF2vuj1BJxeFP8cM3tqePxmdj/wDpIuHTdLOmuUeBqWJ1H67gdOC9cXE4H3hfJ+klM9SFouF/yIpBVzG4CkP5ZUbu7D24Fu4E3huFr6PPAH4N8jj/sRcEFRd4YTSu0k6Rhgm5ldT/LHIDODrwznSZQyS8YX+DbwOMlgI4+Gp64hSZbHSK6JCm4ANgDrJD0JXEuZWsPM9gH3AavM7KUUwr8IOFzS1RHHfAFoA54Ip3xfKLPfIuDx8Pk/AvxbNYHWk3eFGGeSrgT2mNk1NXitHEl36g+b2cZqX8+NjddEGSVpPsmdvrWeQPXlNVETkfR64OvDil80s7eU2f92ktvnxT5rZrW+vmpqnkTOVclP55yrkieRc1XyJHKuSp5EzlXp/wHBQ+FyS9L9xQAAAABJRU5ErkJggg==",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Unique vs. total number of BCL2 (query) Dayhoff k-mers, one point per k-mer size.\n",
+ "# The trailing semicolon keeps the FacetGrid repr out of the cell output.\n",
+ "sns.relplot(\n",
+ "    data=jaccard_df, x=\"query_n_kmers\", y=\"query_n_unique_kmers\", height=3\n",
+ ").set(title=\"BCL2 Dayhoff k-mers: unique vs. total\");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "8a341b04-2470-443f-8758-9df1783e79cf",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.889054Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.888946Z",
+ "iopub.status.idle": "2024-11-12T21:31:32.965630Z",
+ "shell.execute_reply": "2024-11-12T21:31:32.965373Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.889044Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADQCAYAAACZZoRKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAARjUlEQVR4nO3dfbBcdX3H8fdn8bYX81BJSIITiTFMBoUZm8qtpQgtaLUxRUHUINbHYYp/8OTEJ6pUaad1qFPTNnZqhYpA60NTH0OlqTG1ghXFgCkQrI3GK5IhNw9aEkIvXNhv/zhnw2bZm9zfPXvu7tn9vGZ2ds/Zc85+b2a/Ob9z9vf7fRURmNn01bodgFnVOYnMCnISmRXkJDIryElkVlDlk2jlypUB+OHHTDzaqnwS7d27t9sh2ICrfBKZdZuTyKwgJ5FZQc/odgBmU1GvB6P7DjK2f5xFc4dZOn8WtZo6sn3qsVs5iaxjyvri1uvBxm27WLN+K+MTdYaHaqxdvYKVp57Qdp+U7VOP3Y6q3gF1ZGQktmzZ0u0w+lJZX/TUL+6OPY+wat3tjE/UD60bHqpx6xVnsWzB7ELbJx677R/va6IBUq8HO/Y8wh0/3suOPY9Qr0/+H2jji75q3e1cdP13WbXudjZu2zXpPqP7Dh5KCoDxiTpr1m9ldN/BQtsCjO0fP+xL3thn94HxwtunHrsdJ9GAKDMpoNwv7qK5wwwPHf5VHR6qsXDOcOHtU4/djpOo4qZ6din7f/8yv7hL589i7eoVh/ZpNP+Wzp9VePvUY7fjGwsVlnJtcaSkaHdd0fiit14rHO2L3hrLkb64U9kWoFYTK089gedfcRa7D4yzcM6Rr89Stk89dju+sVBhJV5AT+uuVeNGxFS+jCnb9pC2AfpM1GNS7oilnF3K/t+/sc+yBbPbJmWRbXudk6iHpP7vn9LkKjspBplvLPSQ1Iv/1IviRlKcvux4li2YXYXmUyX4TNRDUi/+O3FRbMU5iXpI6h0xcJOrF7g510M68ZuFzTyfiXqIm2fV5CTqMW6eVY+TqGRFx6pY73MSlagTY1Ws9/nGQolSf/exaio1iSSdKOkbku6XtE3Slfn6eZI2SdqePx+Xr5ekdZJ+JOkeSS8qM76ydWKsivW+ss9ETwDviohTgNOBSyWdAlwFbI6I5cDmfBnglcDy/HEJ8PGS4ytVJ8aqWO8rNYki4qGIuDt/fQD4AbAYOA+4Kd/sJuD8/PV5wM2R+Q7wLEnPLjPGMvl3n8EwrRsLkmrA7IjYn7DPUuDXgO8CiyLiofytXcCi/PVi4GdNuz2Yr3uoaR2SLiE7U7FkyZJp/AUzw7/7DIYpn4kkfUbSXEmzgPuA+yW9Z4r7zga+ALyzNfEiG9CUNKgpIq6LiJGIGFmwYEHKrjPOnT77X0pz7pQ8Ac4H/hV4HvDmo+0kaYgsgT4dEV/MV481mmn58+58/U7gxKbdn5OvM+tZKUk0lCfE+cCGiJjgKGcQSQI+CfwgItY2vbUBeGv++q3AV5rWvyW/S3c68HBTs8+sJ6VcE/0dMAr8F3CbpOcCR7smegnZ2epeSVvzde8HrgXWS7oY+CmwOn/vVmAV8CPgUeDtCfGZdcWUkii/kTAWEYub1j0AnHOk/SLiW0wyLh14WZvtA7h0KjGZ9YopNeciog68t2VdRMQTpURlViEp10Rfl/TuvBfCvMajtMjMKiLlmujC/Lm5uRXAss6FY1Y9U06iiHhemYGYVVXKj63PlHS1pOvy5eWSzi0vNLNqSLkm+hTwOHBGvrwT+NOOR2RWMSlJdFJEfASYAIiIR5n89rXZwEhJosclHUveS0HSScBjpURlViEpd+c+BGwETpT0abLeCG8rIyizKkm5O7dJ0t1kg+sEXBkRe0uLzKwiUgflLQaOAX4J+C1JF3Q+JLNqmfKZSNINwAuBbUBj4oAAvjjpTmYDIOWa6PR8roSB57nkrFlKEt0h6ZSIuL+0
aCrAc8lZq5RropvJEumH+XRW90q6p6zAepXnkrNWKWeiT5IPsOOpa6KBk1pDyPpfShLtiYgNpUVSEdOpIWT9LaU59/18xp+LJF3QeJQWWY/yXHLWKuVMdCxZN59XNK0buFvcnkvOWqUk0bsi4ufNKyQN5Bgj1xCyZinNuVskzW0sSHoBcEvnQzKrlpQk+jBZIs2WdBrweeBN5YRlVh0pHVC/mk/e+DVgDvCaiPif0iIzq4ijJpGkj3H4TKe/AvwYuEwSEXFFWcGZVcFUzkRbWpbvKiMQs6o6ahJFxE1H2wZA0hci4rXFQzKrlk4W+fL8czaQOplESTWGzPqFq4ebFdTJJHpavxdJN0jaLem+pnXXSNopaWv+WNX03h/mlcN/KOl3OxibWWmSkkjSsZJOnuTt97VZdyOwss36v4yIFfnj1vzYpwBvAE7N9/lbScekxGfWDSnTCL8K2Eo2bRaSVkg6NDQiIr7Wuk9E3Ab8vHX9JM4DPhcRj0XET8gKfb14qvGZdUvKmegasi/1/wJExFayuq3TcVk+OvYGScfl6yarHP40ki6RtEXSlj179kwzBLPOSEmiiYh4uGXddO7IfRw4CVgBPAR8NPUAVaoebv0vJYm2SXojcExeEeJjwLdTPzAixiLiybz63vU81WRz5XCrpJQkupzsov8x4LNkRY/fmfqBkp7dtPgaoHHnbgPwBkm/nI9TWg7cmXp8s5mW0ov7UeAD+WNKJH0WOBs4XtKDZPN5ny1pBVlTcBR4R378bZLWA/cDTwCXRsSTU/0ss25RVrB7ChtK36DNNVBEvLTTQaUYGRmJLVta+8ialaLtHAApw8Pf3fR6GHgt2RnDbKClNOdah0D8pyRfs9jAS5nQfl7TYg04jWyAntlAS2nO3UV2TSSyZtxPgIvLCMqsSlKacwM5PZbZ0aQ0544422lEDNQkjmYNKc25i4EzgH/Pl88h67GwhwGcCdWsISWJhoBTIuIhONTz4MaIeHspkZlVREq3nxMbCZQbA5Z0OB6zykk5E22W9G9k/eYALgS+3vmQzKol5e7cZfnNhbPyVddFxJfKCcusOlLORI07cL6BYNZkKtMIfysizpR0gMM7oAqIiJg7ya5mA2EqM6CemT/PKT8cs+pJas7ls+8sat4vIh7odFBmVZLSY+FyskF1YzxVPTyAF5YQl1llpJyJrgROjoh9ZQVjVkUpP7b+DGid7cds4KWciXYA/yHpq2STlQAQEWs7HpVZhaQk0QP545fyh5mR1mPhj8sMxKyqUu7O9eRsP2bd5tl+zArybD9mBXm2H7OCPNuPWUEdm+1H0ssjYlPxkMyqpZM1W/+8g8cyq4xuFD6eJ2mTpO3583H5eklalxc+vkfSizoY2xHV68GOPY9wx4/3smPPI9Tr06ldZoOqk0nU7pt3I08vfHwVsDkilgOb82WAV5LVJFoOXEJWUa909XqwcdsuVq27nYuu/y6r1t3Oxm27nEg2ZZ1MoqeZpPDxecBN+eubgPOb1t8cme8Az2opCFaK0X0HWbN+K+MT2eiO8Yk6a9ZvZXTfwbI/2vpEJ5NodIrbLWqaemsX2SA/6FLh47H944cSqGF8os7uA+OFjmuDI3Vk6xnAUg4f2Xpz/nzEaYbbiYiQlNxuiojrgOsgK/KVun+zRXOHGR6qHZZIw0M1Fs4ZLnJYGyBTPhNJ+gfgL4AzgV/PHyPT+MyxRjMtf96dr+9K4eOl82exdvUKhoeyf4rhoRprV69g6fxZZX+09YmUM9EI2TTCRa+4NwBvBa7Nn7/StP4ySZ8DfgN4uGXG1VLUamLlqSfw/CvOYveBcRbOGWbp/FnUam0rC5o9TUoS3QecAEz5iz1J4eNrgfWSLgZ+CqzON78VWAX8CHgUmLE5vms1sWzBbJYtmD1TH2l9JCWJjgfuzzudNo9sffVkO0TERZO89bI22wZwaUI8Zj0hJYmuKSsIsypL6Tv3zSO9L+mOiPjN4iGZVUsnfyfyPWEb
SGV3+zHre6V2+zEbBKX24jYbBCk9Fi5vDFuYxJs7EI9Z5aSciRYB35O0XtJKSYedeSLivkn2M+trU06iiLiabKzPJ4G3AdslfVjSSSXFZlYJSddEea+CXfnjCeA44POSPlJCbGaVkDJl1pXAW4C9wN8D74mICUk1YDvw3nJCNOttKd1+jgMuiIifNq+MiLqkczsblll1TKk5l5eZfENrAjVExA86GpVZhUwpiSLiSeCHkpaUHI9Z5aQ257blQyEOzeJxpKEQZoMgJYn+qLQozCosaSiEpOcCyyPi65KeCRxTXmhm1ZDS7ecPgM8Dn8hXLQa+XEJMZpWS8mPrpcBLgP0AEbEdWFhGUGZVkpJEj0XE440FSc/AY4jMkpLom5LeDxwr6eXAPwO3lBOWWXWkJNFVwB7gXuAdZFNcXV1GUGZVknJ3rg5cnz/MLJfSAfUntLkGiohlHY3IrGJSpxFuGAZeD8ybZFuzgZEyKG9f02NnRPwV8HvlhWZWDSnNuebyjzWyM1NSaRazfpSSBB/lqWuiJ8iKer2+0wGZVU1KEv0LWRI1JigJ4NzGfCURsbazoZlVQ0oSnUZW2OsrZIn0KuBOsqHhySSNAgeAJ4EnImJE0jzgn8iq8Y0CqyPiF9M5vtlMSUmi5wAviogDAJKuAb4aEW8q8PnnRMTepuVGZfFrJV2VL7+vwPHNSpc679zjTcuP81TR4k6ZrLK4Wc9KORPdDNwp6Uv58vnAjQU+O4Cv5YWPP5EXM56ssvhhJF0CXAKwZIlHrFt3KaUEa36b+6x88baI+P60P1haHBE7JS0ENgGXAxsi4llN2/wiIo40dTEjIyOxZcuW6YZhlqLtfPNJv/NExN3A3Z2IJiJ25s+787Pbi8kri0fEQy2Vxc16VldKq0iaJWlO4zXwCrLCyo3K4nB4ZXGzntWtHgeLgC/lvzE9A/hMRGyU9D3aVxY361ldSaKI2AH8apv1+2hTWdysl7lSnllBTiKzgpxEZgU5icwKchKZFeQkMivISWRWUF8O767Xg9F9BxnbP86iucMsnT+LWq1ttyezwvouier1YOO2XaxZv5XxiTrDQzXWrl7BylNPcCJZKfquOTe67+ChBAIYn6izZv1WRvcdPMqeZtPTd0k0tn/8UAI1jE/U2X1gvEsRWb/ruyRaNHeY4aHD/6zhoRoL5wx3KSLrd32XREvnz2Lt6hWHEqlxTbR0/qwuR2b9qu9uLNRqYuWpJ/D8K85i94FxFs7x3TkrV98lEWSJtGzBbJYtmN3tUGwA9F1zzmymOYnMCnISmRWUNGVWL5K0h2w+hl53PLD3qFv1h379W/dGxMrWlZVPoqqQtCUiRo6+ZfUN0t8Kbs6ZFeYkMivISTRzrut2ADNokP5WXxOZFeUzkVlBTiKzgpxEM0DSqKR7JW2V1Fd1YCTdIGm3pPua1s2TtEnS9vz5iOVxqs5JNHPOiYgVffj7yY1A6w+QjbKhy4HN+XLfchJZIRFxG/DzltUDVTbUSTQzGqU178pLZfa7KZUN7Rd9OZ6oB53ZXFpT0n/n/4P3vYiIvC5v3/KZaAY0l9YEGqU1+9lYXi6UQSgb6iQq2RFKa/azgSob6h4LJZO0jOzsA0+V1vyzLobUUZI+C5xNNvxhDPgQ8GVgPbCEvGxoRLTefOgbTiKzgtycMyvISWRWkJPIrCAnkVlBTiKzgpxEfULS0uae1EfZ9ttlxzNInEQDKCLO6HYM/cRJ1IckLZP0fUm/LenOfBzTPZKW5+8/kj//Sf7eVkk7JX0qX/+mpv0+IemYbv49vc5J1GcknQx8AXgb8DrgryNiBTACPNi8bUR8MH/vbLLhDH8j6QXAhcBL8veeBH5/ZqKvJvfi7i8LyPqpXRAR90u6A/iApOcAX4yI7a07SBLwj8DaiLhL0mXAacD3src4lj7vQFqUz0T95WHgAeBMgIj4DPBq4P+AWyW9tM0+1wAPRsSn8mUBN+WjcFdExMkRcU3pkVeYk6i/
PA68BniLpDfmnV93RMQ6sjPUC5s3lvQq4HeAK5pWbwZel499asyX8NwZib6i3JzrMxFxUNK5wCbgFuBCSRNkI0w/3LL5GmAxcGfedNsQER+UdDXZSNwaMAFcSjWKBnSFe3GbFeTmnFlBTiKzgpxEZgU5icwKchKZFeQkMivISWRW0P8DFSEH9VsdLG8AAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Number of unique BCL2 (query) Dayhoff k-mers as a function of k-mer size.\n",
+ "# The trailing semicolon keeps the FacetGrid repr out of the cell output.\n",
+ "sns.relplot(data=jaccard_df, x=\"ksize\", y=\"query_n_unique_kmers\", height=3).set(\n",
+ "    title=\"BCL2 unique Dayhoff k-mers by k-mer size\"\n",
+ ");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "5e8a9002-2073-41f7-a7d2-a4ca316a01ce",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:32.966566Z",
+ "iopub.status.busy": "2024-11-12T21:31:32.966453Z",
+ "iopub.status.idle": "2024-11-12T21:31:33.108691Z",
+ "shell.execute_reply": "2024-11-12T21:31:33.108401Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:32.966556Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPwAAADRCAYAAADomd+PAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAapUlEQVR4nO3de5hU1ZX38e+vu7lIy51GlIugIgmKAWzRRCYhShzUBMJo4iV5oxMzzuSJ0Ywmo8Ykw+DknfiYODET48QxRk28RB0xTOJ4I/iaiwotIAqIEkQuIjQICA100816/zi7TdHd1V1dVaeqq8/6PE89XXUu+yxKV51Tu87aW2aGcy4ZyoodgHOucDzhnUsQT3jnEsQT3rkE8YR3LkFKLuFnzJhhgD/8UYhHt1NyCb9t27Zih+BcySq5hHfOZc8T3rkE6TYJ33SwkaamxmKH4VyXVlHsAHJ10A6ydc/brHrnJRqa9jFu6CSO6nc0PSt6Fzs057qckj/Db6/bwsI3HmXz7nVs37uFP617gk3vrSt2WM51SSWf8O/sXo+1+AVl1ZaXONDUUKSInOu6Yk14STMkrZa0RtJ1bawfJWmhpKWSlks6p7PHqFCPNpcJZRm1c91XbAkvqRy4DTgbGA9cJGl8i82+BTxkZpOAC4GfdPY4w/qNpKLs0KQ/8cgpVJS3/iBwLuni7LSbAqwxs7UAkh4EZgErU7YxoF943h94u7MHGdiniunHn8/bu96kvqmeEf2PYXDlsBxDd657ijPhhwMbUl5vBE5tsc0c4ClJXwUqgeltNSTpcuBygFGjRrVaP6jPUAb1GZp7xM51c8XutLsIuNvMRgDnAL+Q1ComM7vDzKrNrLqqqqrgQTrXXcSZ8JuAkSmvR4RlqS4DHgIws+eB3sCQGGNyLtHiTPjFwFhJYyT1JOqUm99im/XAmQCSPkiU8LUxxuRcosWW8GbWCFwBPAmsIuqNXyFprqSZYbNrgL+T9DLwAHCp+aiazsVGpZZf1dXVVlNTU+wwXDJ0u5s5it1p55wrIE945xLEE965BPGEdy5BPOGdSxBPeOcSxBPeuQTxhHcuQTzhnUsQT3jnEsQT3rkE8YR3LkE84Z1LEE945xLEE965BPGEdy5BPOGdSxBPeOcSxBPeuQTxhHcuQTzhnUsQT3jnEsQT3rkE8YR3LkE84Z3LI0mjJb2a4bZ/ijueljzhnSsSM/tIoY/pCd/FNDQ2sGX3Fta9u44de3cUOxyXA0nHSFoq6WOSFklaJmm5pLFh/Z7wd25Yt0zSJkk/D8s/n7LfTyWV5xpTrAkvaYak1ZLWSLouzTaflbRS0gpJ98cZT1dX31jPoo013L/sV8xbMZ9fLn2A9Ts3FDsslwVJ44D/Bi4FzgduNbOJQDWwMXVbM/tOWDcNeBf4cZhN+QLg9LCuCfhcrnHFlvDh0+g24GxgPHCRpPEtthkLXE/0jzoB+Fpc8ZSCbXXbWLzhLxNlNh5s5KnXn6GuYW8Ro3JZqAJ+DXzOzF4Gnge+Kela4Ggz29dyB0kCfgncYmYvEU2jfjKwWNKy8PqYXAOL8ww/BVhjZmvNrAF4EJjVYpu/A24zsx0AZrY1xni6vD0Nda2W7a7fzf7GVv9/uK5tF7AemApgZvcDM4F9wOOSzmhjnznARjP7eXgt4B4zmxge48xsTq6BxZnww4HU69GNYVmq44HjJf1R0guSZsQYT5fXv3e/VsuqKodQ2aOyCNG4HDQAs4EvSLpY0jHAWjP7EdGZ/6TUjSV9CpgOXJmyeAFwvqShYZtBko7ONbBid9pVAGOJvrtcBPyXpAEtN5J0uaQaSTW1tbWFjbCABvcZzPTjzqCirAKAfr36Mn3smfTu0bvIkbnOMrM64JPAPwKfB14Nl+YnAve22PxqopNhcwfdXDNbCXwLeErScuBp4Mhc45KZ5dpG2w1LHwbmmNlfh9fXA5jZv6Vs85/Ai82XMZIWANeZ2eJ07VZX
V1tNTU261SXPzNi5byf7G+vp17svlT397F5EKnYA+RbnGX4xMFbSGEk9gQuB+S22eYzo7I6kIUSX+GtjjKnLk8TAPgM5st8wT3aXd7ElvJk1AlcATwKrgIfMbEX4zXFm2OxJYLuklcBC4Btmtj2umJxLutgu6ePS3S/pXZfil/TOudLlCe9cgnjCO5cgnvDOdWGSRkpamFJvclUu7VXkKzDnku7+JbeWATOAycAS4ImLJ191MMdmG4FrzGyJpL7AS5KeDjfmdJqf4Z3Lg5Ds84DfAjeGv/PC8qyZ2WYzWxKe7yb6ibvlLeoZ84R3Lj9mEBXIpJoZlueFpNHAJODFbNvwhHcuPyanWT4pH41LOpyovv5rZvZetu14wjuXH0vSLF+aa8OSehAl+31m9mgubXnCO5cfT9C6VmR+WJ61MDDGz4BVZnZLLm2BJ7xzeRF642cD5xKVtZ4LzM5DL/3pwP8BzkgZ9+6cbBvze+mdS8/vpXfOla60N95I+g8g7enfzK5Mt8451zW1d4avAV4CehP95PBGeEwEesYemXMu79Ke4c3sHgBJXwamhgEtmoel+n1hwnPO5VMm3+EHAqnDqR4eljnnSkwmxTPfA5ZKWkjUa/lRojG0nXMlpt2El1QGrAZODQ+Aa83snbgDc85FwixONcAmM/tkLm21m/BmdlDSbWY2iWgAfedcGtc+8aNW5bE3zbgy1xtvAK4iqpJrPVNJJ2XyHX6BpPPCLX7OuTaEZG9VHhuWZ03SCKK79u7MOUgyS/i/Bx4G6iW9J2m3pKyrdZzrpuIqj/0h8E9APq4UOk54M+trZmVm1tPM+oXXOV9aONfN5L08VtInga1hNtm8yGiIK0kDieaAe3+SMzN7Ll9BONcNxFEeezowMxTL9Ab6SfqlmX0+2wY7PMNL+hLwHNEsMf8S/s7J9oDOdVN5L481s+vNbISZjSaaqu13uSQ7ZPYd/irgFOAtM/s40SXKzlwO6lx3E3rjW5XH5qmXPm86LI+VtNjMTglT3Z5qZvWSVpjZCQWJsAUvj3UF1O1+mcrkO/zGMGf7Y8DTknYAb8UZlHMuHpn00s82s51mNgf4NtFwO5/OpHFJMyStlrRG0nXtbHeeJJNUnWHczrksZNJpd1oYAB8z+3/As2TwU0O4HfA24GxgPHCRpPFtbNeXqJ8g66F3nXOZyaTT7nZgT8rrPWFZR6YAa8xsrZk1AA8Cs9rY7kbgJmB/Bm0653KQScLLUnr2zOwgmX33Hw5sSHm9kRYzZkiaDIw0s9+2G4B0uaQaSTW1tbUZHNo515ZMEn6tpCsl9QiPq4C1uR44VOLdAlzT0bZmdoeZVZtZdVVVVa6Hdi6xMkn4fwA+AmwiOkufClyewX6bgJEpr0eEZc36AicCz0paB5wGzPeOO+cOJWmApEckvSZplaQPZ9tWh5fmZraV6C6fzloMjJU0hijRLwQuTml3FzCk+bWkZ4Gvm5n/yO5K0uyft549dt7f5jwuPcCtwBNmdr6knkCfbBvKpJf+nvA7fPPrgZLu6mi/MAbeFUS34q4CHjKzFZLmSmpZVeRcSQvJ3qo8NizPmqT+RKNM/QzAzBrMbGe27WXS+XZS6gHMbIekjCqAzOxx4PEWy76TZttpmbTpXBfVXnns4603z9gYoBb4uaQPEY0kfZWZ1WXTWCafPmWhWg4ASYPIsMrOuQSJa/bYitD27WHkqTog7U1smTTWkR8Az0t6mOje4vOB72Z7QOe6qbhmj90IbDSz5hvTHiGHhM/k1tp7gfOALcA7wN+Y2S+yPaBz3VQss8eGAWM3SBoXFp0JrMy2vYwnk5Q0lEMHwFif7UFz4dVyroA6VS2X0ks/iejMnpdeekkTica060l0D8zfmtmOrNrKoDx2JtFl/VHAVuBoormqvTzWdXfdrjw2k067G4luinndzMYQXVK8EGtUzrlYZJLwB8xsO1FvfZmZLQT8bjjnSlAmvfQ7JR1ONIHkfZK2Ev004Jwr
MZmc4WcCe4lq1p8A1gA5TXfjnCuOtGd4SX8ws6lEP8c19+w1d2L8q6R3gZvN7Ccxx+icy5P25oefGv72bWu9pMHAnwBPeOdKRNY39oeOvGn5C8U515Kkf5S0QtKrkh6Q1LvjvdLL6Z54M9ucy/7OdSfV//eWVuWxNd+8OusbbyQNB64ExpvZPkkPEZWZ351tmzmV7jnnIiHZW5XHhuW5qAAOk1RBVAf/di6NecI7lx95nz3WzDYB3wfWA5uBXWb2VNYR4gnvXL7EMXvsQKKRnscQ3dpeKSn2ueWccx2Lozx2OvCmmdWa2QHgUaLxJbPmCe9cfsRRHrseOE1SH0kiqmNZlUN7nvDO5UPojW81e2wuvfRh0ItHiK4eXiHK1ztyiTPjeviuwstjXQElsjzWOddNeMI7lyCe8M4liCe8cwniCe9cgnjCO5cgnvDOdWGS7pK0VdKrKcsGSXpa0hvh78D22kgV65RRkmYQzXxZDtxpZt9rsf5q4EtAI9H8WV80s7fijMkVjpmxfe9Odu2vo2+vPgypHECZuu85ZtRXb25VHrv+P76R67j0dwM/Bu5NWXYdsMDMvifpuvD62kwai+3GG0nlwOvAJ4imy1kMXGRmK1O2+TjwopntlfRlYJqZXdBeu37jTelYuXUt97/8BAeaGqkoK+f8E8/kpGHHU15WMkmf8Y03IdnncWjF3Hxgdq5JL2k08BszOzG8Xk2UK5slHQk8a2bj2mujWZzv/BRgjZmtNbMG4EGiyp/3mdlCM9sbXr4AjIgxHldA2+t28qvlT3GgqRGAxoNNPPzKM9TWZTVhSinIe3lsO45IGXzmHeCITHeMM+GHAxtSXm8My9K5DPjftlZIulxSjaSa2traPIbo4rK7YS/7GxsOWdZkB3mvvtuOcB7X7LHtsugSPePL9C5xbRVqfKuBm9tab2Z3mFm1mVVXVVUVNjiXlb69+tC7ouchy8pVRr9elUWKKHZxzR7bli3hUp7wd2umO8aZ8JuAkSmvR4Rlh5A0HbgBmGlm9THG4wpocJ8BXHDSWfQs7wFARVk5n5nwCaoqM+5QLjWxzB6bxnzgkvD8EuDXme4YZ6ddBVGn3ZlEib4YuNjMVqRsM4mo/G+Gmb2RSbveaVc6zIxte3exe/8eDi/NXvpOVcul9NK/P3tsHjrsHiAaHXoI0RwR/ww8BjwEjALeAj5rZu9m1F6c5bGSzgF+SPSz3F1m9l1Jc4EaM5sv6RlgAtF4XQDrzaxlx8chPOFdAXW78livh3cuvW6X8CV1feWcy40nvHMJ4gnvXIJ4wjuXIJ7wziWIJ7xzXVia8tibJb0mabmkeZIGZNperOWxziXJcZ+e26o8ds1j34mjPPZp4Hoza5R0E3A9GZbH+hneuTwIyd5q9tiwPGtm9hzwbotlT5lZY3jZqSpTT3jn8qOQ5bGpvkiaKtO2eMI7lx8FL4+VdAPRaFH3ZbqPf4d3Lj8KWR6LpEuBTwJnWifuj/czvHP5UbDy2DBW5D8RlZTv7Wj7VJ7wzuVB6I1vNXtsrr30oTz2eWCcpI2SLiPqte8LPC1pmaT/zLg9r5ZzLi2vlnPOlS5PeOcSxBPeuQTxn+VcSWpobGTz7nfZe2A/Qyr7U1XZv9ghlQRPeFdy9jXU88TrS/jNqkUYUNmjF1dOncnYIUcVO7Quzy/pXclZv6uW/wnJDlB3oJ57X1rAnvp9RY2rFHjCu5KzY9+eVss2vfcuexr2FyGaeLVVHpuy7hpJJmlIpu35Jb0rOYP79G21bNSAKvr2OqwI0fzFhMnXtCqPfWXJD+Ioj0XSSOAsYH1nGvMzvCs5IwdU8dmTplIeJrUY0LuSS04+g8qevYsWU0j2VuWxYXnW2iqPDf6d6PbaTt0552d4V3J6V/Rk+tiJTBg2mr0H6hnSpy8D2zjrF1h75bGP5/NAkmYBm8zsZalzNwN6wruSVFFWzvD+g9Ou312/j407d9DQ1Mjw/gMZUhn7B0J75bF5S3hJ
fYBvEl3Od5onvOt2ttXt5qfP/46aDesAGFJ5ODdMn8XoQRn3bWWjUOWxxwJjgOaz+whgiaQpZvZORzv7d3jX7aza8vb7yQ6wrW4Pz7z+Ko0Hm+I8bEHKY83sFTMbamajzWw0sBGYnEmyQ8xn+FC3eyvRZJJ3mtn3WqzvRdT7eDKwHbjAzNbFGZPr/t56d9v7z6sq+3LWuJPYtOM9Fry2mmF9+1Nff5DDmsTmTe9SXl7GcccM4+iRuZ39X1nyg4MTJl8zmxazx+baS586e6ykjcA/m9nPsm0vtoSXVA7cBnyC6FNosaT5ZrYyZbPLgB1mdpykC4GbgAviisklw3FVw95/PmPcRH7y+98z44MncLBJrN28k6MqevHtf3uUP/95CwCTJo7mG1+fxYTxo3I6bkjux8njd3Yzu6iD9aM7016cl/RTgDVmttbMGoAHgVkttpkF3BOePwKcqc52OzrXwgeGHsm5H5zIsYOHsmTDRsrLyjhmUBW17+2jV3k5Cxe8+n6yAyxdto4XF60pYsSFE2fCDwc2pLzeGJa1uU0YdncX0KrrVdLlkmok1dTW1sYUrusuBhzWhy9Un87VH5vBgaYmBvXpQ2OTsbNuP0MP78Oyl9e12mfFik7dv1KySqLTzszuMLNqM6uuqqoqdjiuBPSsqOCo/gOZNeFDbK+ro0d5GQMrD2Pzrj2cMuXYVttP/NCYIkRZeHEm/CZgZMrrEWFZm9tIqgD6E3XeOZcXE0cM59szzmH73t0cMaAPViam/tUJTJw4+v1tpn3sBE45pfWHQHcU25h2IYFfB84kSuzFwMVmtiJlm68AE8zsH0Kn3d+Y2Wfba9fHtHPZ2H/gAA2NjWx5bw+79zbQu0nUvrOLiooyjj32CIZVDWhrt27XnxRbL32Y9+oK4Emin+XuMrMVkuYCNWY2H/gZ8AtJa4juF74wrnhcsvXu0YPePXrQ77CUApvjklc/76PWOpdetzvDl0SnnXMuPzzhnUuQkrukl1QLvBVT80OAbR1uVRhdKRboWvEUKpZtZhb37K8FVXIJHydJNWZWXew4oGvFAl0rnq4US6nxS3rnEsQT3rkE8YQ/1B3FDiBFV4oFulY8XSmWkuLf4Z1LED/DO5cgnvDOJUhiE17SOkmvSFomqSYsGyTpaUlvhL8DYzx+qxlF0h1fkR9JWiNpuaR0I6TmO545kjaF92iZpHNS1l0f4lkt6a/zHMtISQslrZS0QtJVYXnR3p/uIrEJH3zczCam/KZ7HbDAzMYCC8LruNxNNP5ZqnTHPxsYGx6XA7cXKB6Afw/v0UQzexxA0niiQqcTwj4/CUOa5UsjcI2ZjQdOA74SjlnM96dbSHrCt5Q65NY9wKfjOlCaGUXSHX8WcK9FXgAGSDqyAPGkMwt40MzqzexNYA3RkGb5imWzmS0Jz3cDq4hGRyra+9NdJDnhDXhK0kuSLg/LjjCzzeH5O8ARBY4p3fEzGS4sLleEy+S7Ur7iFCweSaOJRoF9ka75/pSUJCf8VDObTHQ5+BVJH01dadHvlUX7zbLYxw9uJ5r4YCKwGfhBIQ8u6XDgv4Gvmdl7qeu6yPtTchKb8Ga2KfzdSjQJ4BRgS/OlYPi7tcBhpTt+JsOF5Z2ZbTGzJjM7CPwXf7lsjz0eST2Ikv0+M3s0LO5S708pSmTCS6qU1Lf5OdE8Xa8SzRRySdjsEuDXBQ4t3fHnA18IvdGnAbtSLm1j0+J78Gyi96g5ngsl9ZI0hqizbFEejyui0ZBWmdktKau61PtTkswscQ/gGODl8FgB3BCWDybq/X0DeAYYFGMMDxBdJh8g+s55WbrjE428chvwZ+AVoLpA8fwiHG85UVIdmbL9DSGe1cDZeY5lKtHl+nJgWXicU8z3p7s8/NZa5xIkkZf0ziWVJ7xzCeIJ71yCeMI7lyCe8M4liCe8cwniCZ8Qkv5UoONcKunHhTiW6zxP+BiEiTS7FDP7SLFjyJc8l+Imiic8IOkGSa9L+oOk
ByR9XdKzkqrD+iGS1oXn5ZJulrQ4VJH9fVg+TdLvJc0HVkqaK+lrKcf4bvNADm0cf1o43iOSXpN0X7i9NF286yQNCc+rJT0bns8JVW3PSlor6cqUffaEv5L04zBwxTOSHpd0fgftVoZ2F0laKmlWhu/ruZKeD+/f3ZJul/RCiG1aaHOVpLtT9jkr7LNE0sOhgKY5tpskLQE+I+lKRQNkLJf0YCbxuBhnjy0Vkk4mGsxhItH7sQR4qZ1dLiO6V/sUSb2AP0p6KqybDJxoZm+Gss5HgR9KKgvHaK9mfBLRgBJvA38ETgf+kMU/6QPAx4G+wGpJt5vZgZT1s4FxwHii8tKVwF0dtHkD8Dsz+6KkAcAiSc+YWV26HSTNBq4GzjGzHeHzayDwYWAm0a26pwNfAhZLmkh0S++3gOlmVifp2tDG3NDsdosqHJH0NjDGzOpDTC4DiU944K+AeWa2FyCcodtzFnBS81kR6E9UPNIALLJoQAjMbJ2k7ZImESXWUjPb3k67i8xsY4hhGTCa7BL+t2ZWD9RL2hqOvTFl/UeBB8ysCXhb0u8yaPMsYKakr4fXvYFRRANTtOUMoBo4yw4ta/0fMzNJrwBbzOwVAEkriP69I4g+iP4YPiB6As+n7P+rlOfLgfskPQY8lsG/weEJ355G/vKVp3fKcgFfNbMnUzeWNA1oeca7E7gUGEbHZ9H6lOdNtP/fJl1snW0n03YFnGdmqzNs589EBUrHA6lzezfHdrBFnAdDnE3A02Z2UZp2U9/fc4k+vD4F3CBpgpk1ZhhfYvl3eHgO+LSkwxSVzH4qLF8HnByen5+y/ZPAlxXVayPpeEUltm2ZRzTm2ylhv3xJje28Tu77HHBB6Is4kujyv6N2nwS+2tyvEK5a2vNW2P9eSSd0IrYXgNMlHReOUynp+JYbha9II81sIXAt0VXW4Z04TmIlPuEtGjvtV0Slsv8LLA6rvk+U2EuJZittdifR994likZ4/SlpzqJm1gAsBB4Kl9D58i/ArYpG2+1su/OIyktXAvdy6CVzunZvBHoAy8Pl940dHcTMXgM+Bzws6dhMAjOzWqIrogckLQ+xfaCNTcuBX4avBkuBH5nZzkyOkXReHtuCpDnAHjP7fh7aKiPqBPyMmb2Ra3txCD3kvzGzR4odi4tf4s/wcVE0rPIaomGVu2Syu+TxM3wBSZpANIpMqnozOzXN9vOAMS0WX9uyw7AYFE0+cVOLxW+a2exixOMy4wnvXIL4Jb1zCeIJ71yCeMI7lyCe8M4lyP8Hquob3/fhN50AAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plot Jaccard similarity against the number of unique query k-mers,\n",
+ "# with points colored by k-mer size (ksize).\n",
+ "sns.relplot(\n",
+ "    data=jaccard_df,\n",
+ "    x=\"query_n_unique_kmers\",\n",
+ "    y=\"jaccard\",\n",
+ "    height=3,\n",
+ "    hue=\"ksize\",\n",
+ "    palette=\"crest\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2aaf46a0",
+ "metadata": {},
+ "source": [
+ "# Note: the Jaccard cutoff used below (0.01) was lowered from the 0.5 used for the hp dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "19421cc7-be9f-45fc-b848-f29aa449a11e",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:33.109164Z",
+ "iopub.status.busy": "2024-11-12T21:31:33.109056Z",
+ "iopub.status.idle": "2024-11-12T21:31:33.115226Z",
+ "shell.execute_reply": "2024-11-12T21:31:33.115001Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:33.109153Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " query | \n",
+ " match | \n",
+ " moltype | \n",
+ " ksize | \n",
+ " jaccard | \n",
+ " query_n_kmers | \n",
+ " query_n_unique_kmers | \n",
+ " match_n_kmers | \n",
+ " match_n_unique_kmers | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 2 | \n",
+ " 0.818182 | \n",
+ " 238 | \n",
+ " 28 | \n",
+ " 279 | \n",
+ " 32 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 3 | \n",
+ " 0.606061 | \n",
+ " 237 | \n",
+ " 100 | \n",
+ " 278 | \n",
+ " 112 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 4 | \n",
+ " 0.198113 | \n",
+ " 236 | \n",
+ " 172 | \n",
+ " 277 | \n",
+ " 209 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 5 | \n",
+ " 0.055046 | \n",
+ " 235 | \n",
+ " 204 | \n",
+ " 276 | \n",
+ " 256 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " BCL2 | \n",
+ " ced9 | \n",
+ " dayhoff | \n",
+ " 6 | \n",
+ " 0.016667 | \n",
+ " 234 | \n",
+ " 215 | \n",
+ " 275 | \n",
+ " 273 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n",
+ "0 BCL2 ced9 dayhoff 2 0.818182 238 28 \n",
+ "1 BCL2 ced9 dayhoff 3 0.606061 237 100 \n",
+ "2 BCL2 ced9 dayhoff 4 0.198113 236 172 \n",
+ "3 BCL2 ced9 dayhoff 5 0.055046 235 204 \n",
+ "4 BCL2 ced9 dayhoff 6 0.016667 234 215 \n",
+ "\n",
+ " match_n_kmers match_n_unique_kmers \n",
+ "0 279 32 \n",
+ "1 278 112 \n",
+ "2 277 209 \n",
+ "3 276 256 \n",
+ "4 275 273 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "jaccard_df.query(\"jaccard > 0.01\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02f77666-adb5-4aa2-b660-3807ca0848bb",
+ "metadata": {},
+ "source": [
+ "# Functionalize the analysis for any two sequences\n",
+ "\n",
+ "Thanks Claude"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "cb300f9d-acbf-4663-91df-3689d270e907",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:33.115667Z",
+ "iopub.status.busy": "2024-11-12T21:31:33.115570Z",
+ "iopub.status.idle": "2024-11-12T21:31:33.206289Z",
+ "shell.execute_reply": "2024-11-12T21:31:33.206015Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:33.115658Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " query | \n",
+ " match | \n",
+ " moltype | \n",
+ " ksize | \n",
+ " jaccard | \n",
+ " query_n_kmers | \n",
+ " query_n_unique_kmers | \n",
+ " query_intersection_positions | \n",
+ " match_n_kmers | \n",
+ " match_n_unique_kmers | \n",
+ " match_intersection_positions | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 5 | \n",
+ " 0.055046 | \n",
+ " 276 | \n",
+ " 256 | \n",
+ " [[21, bbced], [27, febed], [28, ebedb], [32, b... | \n",
+ " 235 | \n",
+ " 204 | \n",
+ " [[10, cdcee], [24, cdbfc], [33, cebbb], [34, e... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 6 | \n",
+ " 0.016667 | \n",
+ " 275 | \n",
+ " 273 | \n",
+ " [[27, febedb], [46, cebbbb], [48, bbbbdc], [49... | \n",
+ " 234 | \n",
+ " 215 | \n",
+ " [[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 7 | \n",
+ " 0.002033 | \n",
+ " 274 | \n",
+ " 274 | \n",
+ " [[48, bbbbdcb]] | \n",
+ " 233 | \n",
+ " 219 | \n",
+ " [[58, bbbbdcb]] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 8 | \n",
+ " 0.000000 | \n",
+ " 273 | \n",
+ " 273 | \n",
+ " [] | \n",
+ " 232 | \n",
+ " 222 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 9 | \n",
+ " 0.000000 | \n",
+ " 272 | \n",
+ " 272 | \n",
+ " [] | \n",
+ " 231 | \n",
+ " 223 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 10 | \n",
+ " 0.000000 | \n",
+ " 271 | \n",
+ " 271 | \n",
+ " [] | \n",
+ " 230 | \n",
+ " 224 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n",
+ "0 CED9 BCL2 dayhoff 5 0.055046 276 256 \n",
+ "1 CED9 BCL2 dayhoff 6 0.016667 275 273 \n",
+ "2 CED9 BCL2 dayhoff 7 0.002033 274 274 \n",
+ "3 CED9 BCL2 dayhoff 8 0.000000 273 273 \n",
+ "4 CED9 BCL2 dayhoff 9 0.000000 272 272 \n",
+ "5 CED9 BCL2 dayhoff 10 0.000000 271 271 \n",
+ "\n",
+ " query_intersection_positions match_n_kmers \\\n",
+ "0 [[21, bbced], [27, febed], [28, ebedb], [32, b... 235 \n",
+ "1 [[27, febedb], [46, cebbbb], [48, bbbbdc], [49... 234 \n",
+ "2 [[48, bbbbdcb]] 233 \n",
+ "3 [] 232 \n",
+ "4 [] 231 \n",
+ "5 [] 230 \n",
+ "\n",
+ " match_n_unique_kmers match_intersection_positions \n",
+ "0 204 [[10, cdcee], [24, cdbfc], [33, cebbb], [34, e... \n",
+ "1 215 [[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81... \n",
+ "2 219 [[58, bbbbdcb]] \n",
+ "3 222 [] \n",
+ "4 223 [] \n",
+ "5 224 [] "
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from typing import Dict, List, Set\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "class Sequence:\n",
+ "    \"\"\"A named sequence plus the encoded form that k-mers are drawn from.\n",
+ "\n",
+ "    The encoded form is whatever ``degenerate_protein_chatgpt(sequence, moltype)``\n",
+ "    returns; it is computed once, at construction time.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, sequence: str, name: str, moltype: str = \"dayhoff\"):\n",
+ "        self.name = name\n",
+ "        self.moltype = moltype\n",
+ "        self.sequence = sequence\n",
+ "        # Encode eagerly so every later k-mer request reuses the same string.\n",
+ "        self.processed_seq = self._process_sequence()\n",
+ "\n",
+ "    def _process_sequence(self) -> str:\n",
+ "        \"\"\"Return ``self.sequence`` converted according to ``self.moltype``.\"\"\"\n",
+ "        encoded = degenerate_protein_chatgpt(self.sequence, self.moltype)\n",
+ "        return encoded\n",
+ "\n",
+ "    def get_kmers(self, k: int) -> List[str]:\n",
+ "        \"\"\"Return every overlapping length-``k`` window of the encoded sequence, in order.\n",
+ "\n",
+ "        The list is empty when ``k`` exceeds the encoded sequence length.\n",
+ "        \"\"\"\n",
+ "        encoded = self.processed_seq\n",
+ "        n_windows = len(encoded) - k + 1\n",
+ "        windows = []\n",
+ "        for start in range(n_windows):\n",
+ "            windows.append(encoded[start : start + k])\n",
+ "        return windows\n",
+ "\n",
+ "\n",
+ "class KmerAnalyzer:\n",
+ "    \"\"\"Compare the k-mer content of two ``Sequence`` objects over a range of k.\n",
+ "\n",
+ "    ``seq1`` is reported as the \"query\" and ``seq2`` as the \"match\" in the result\n",
+ "    table. Both objects must provide ``name``, ``moltype`` and ``get_kmers(k)``.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, seq1: Sequence, seq2: Sequence):\n",
+ "        self.seq1 = seq1\n",
+ "        self.seq2 = seq2\n",
+ "\n",
+ "    def calculate_jaccard(self, kmer_set1: Set[str], kmer_set2: Set[str]) -> float:\n",
+ "        \"\"\"Return len(A & B) / len(A | B), or 0.0 when both sets are empty.\"\"\"\n",
+ "        intersection = len(kmer_set1.intersection(kmer_set2))\n",
+ "        union = len(kmer_set1.union(kmer_set2))\n",
+ "        return intersection / union if union > 0 else 0.0\n",
+ "\n",
+ "    def analyze_kmer_range(self, start_k: int, end_k: int) -> pd.DataFrame:\n",
+ "        \"\"\"Return one row per k in ``start_k..end_k`` (both ends inclusive).\n",
+ "\n",
+ "        The columns are those listed by ``_get_column_names``.\n",
+ "        \"\"\"\n",
+ "        results = [self._analyze_single_k(k) for k in range(start_k, end_k + 1)]\n",
+ "        return pd.DataFrame(results, columns=self._get_column_names())\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def _first_positions(kmers: List[str], wanted: Set[str]) -> List[list]:\n",
+ "        \"\"\"Return ``[position, kmer]`` for the first occurrence of each wanted k-mer.\"\"\"\n",
+ "        first_seen: Dict[str, int] = {}\n",
+ "        for position, kmer in enumerate(kmers):\n",
+ "            if kmer in wanted and kmer not in first_seen:\n",
+ "                first_seen[kmer] = position\n",
+ "        # Positions are recorded in increasing order and dicts keep insertion\n",
+ "        # order, so the pairs come out already sorted by position.\n",
+ "        return [[position, kmer] for kmer, position in first_seen.items()]\n",
+ "\n",
+ "    def get_intersecting_kmer_positions(self, kmers1, kmers2, kmer_set1, kmer_set2):\n",
+ "        \"\"\"Locate the k-mers that occur in both sequences.\n",
+ "\n",
+ "        Args:\n",
+ "            kmers1: Ordered list of k-mers from the first sequence.\n",
+ "            kmers2: Ordered list of k-mers from the second sequence.\n",
+ "            kmer_set1: Set of the k-mers in ``kmers1``.\n",
+ "            kmer_set2: Set of the k-mers in ``kmers2``.\n",
+ "\n",
+ "        Returns:\n",
+ "            A pair ``(positions1, positions2)``. Each is a list of\n",
+ "            ``[position, kmer]`` pairs sorted by position, where ``position`` is the\n",
+ "            0-based index of the k-mer's first occurrence in the corresponding\n",
+ "            list. Repeat occurrences are not reported, and a shared k-mer that is\n",
+ "            absent from a list is omitted from that list's result.\n",
+ "        \"\"\"\n",
+ "        shared = kmer_set1.intersection(kmer_set2)\n",
+ "        # One pass per list instead of a list.index() scan per shared k-mer.\n",
+ "        positions1 = self._first_positions(kmers1, shared)\n",
+ "        positions2 = self._first_positions(kmers2, shared)\n",
+ "        return positions1, positions2\n",
+ "\n",
+ "    def _analyze_single_k(self, k: int) -> List:\n",
+ "        \"\"\"Build the result row for one k-mer size.\n",
+ "\n",
+ "        The order of the values must match ``_get_column_names``.\n",
+ "        \"\"\"\n",
+ "        kmers1 = self.seq1.get_kmers(k)\n",
+ "        kmers2 = self.seq2.get_kmers(k)\n",
+ "        set1 = set(kmers1)\n",
+ "        set2 = set(kmers2)\n",
+ "\n",
+ "        pos1, pos2 = self.get_intersecting_kmer_positions(kmers1, kmers2, set1, set2)\n",
+ "\n",
+ "        return [\n",
+ "            self.seq1.name,\n",
+ "            self.seq2.name,\n",
+ "            self.seq1.moltype,\n",
+ "            k,\n",
+ "            self.calculate_jaccard(set1, set2),\n",
+ "            len(kmers1),\n",
+ "            len(set1),\n",
+ "            pos1,\n",
+ "            len(kmers2),\n",
+ "            len(set2),\n",
+ "            pos2,\n",
+ "        ]\n",
+ "\n",
+ "    def _get_column_names(self) -> List[str]:\n",
+ "        \"\"\"Return the column names, in the order ``_analyze_single_k`` emits values.\"\"\"\n",
+ "        return [\n",
+ "            \"query\",\n",
+ "            \"match\",\n",
+ "            \"moltype\",\n",
+ "            \"ksize\",\n",
+ "            \"jaccard\",\n",
+ "            \"query_n_kmers\",\n",
+ "            \"query_n_unique_kmers\",\n",
+ "            \"query_intersection_positions\",\n",
+ "            \"match_n_kmers\",\n",
+ "            \"match_n_unique_kmers\",\n",
+ "            \"match_intersection_positions\",\n",
+ "        ]\n",
+ "\n",
+ "\n",
+ "def compare_sequences(\n",
+ "    seq1_str: str,\n",
+ "    seq2_str: str,\n",
+ "    seq1_name: str,\n",
+ "    seq2_name: str,\n",
+ "    start_k: int = 5,\n",
+ "    end_k: int = 30,\n",
+ "    moltype: str = \"dayhoff\",\n",
+ ") -> pd.DataFrame:\n",
+ "    \"\"\"Compare the k-mer content of two raw sequences for every k in a range.\n",
+ "\n",
+ "    Args:\n",
+ "        seq1_str: Raw sequence reported as the \"query\".\n",
+ "        seq2_str: Raw sequence reported as the \"match\".\n",
+ "        seq1_name: Label for ``seq1_str`` in the ``query`` column.\n",
+ "        seq2_name: Label for ``seq2_str`` in the ``match`` column.\n",
+ "        start_k: Smallest k-mer size to analyze (inclusive).\n",
+ "        end_k: Largest k-mer size to analyze (inclusive).\n",
+ "        moltype: Alphabet name handed to ``Sequence`` for both inputs.\n",
+ "\n",
+ "    Returns:\n",
+ "        The DataFrame built by ``KmerAnalyzer.analyze_kmer_range``, one row per k.\n",
+ "    \"\"\"\n",
+ "    seq1 = Sequence(seq1_str, seq1_name, moltype=moltype)\n",
+ "    seq2 = Sequence(seq2_str, seq2_name, moltype=moltype)\n",
+ "    analyzer = KmerAnalyzer(seq1, seq2)\n",
+ "    return analyzer.analyze_kmer_range(start_k, end_k)\n",
+ "\n",
+ "\n",
+ "# Compare CED9 (query) against BCL2 (match) for k = 5 (the default start_k) through 10.\n",
+ "df = compare_sequences(ced9_seq, bcl2_seq, \"CED9\", \"BCL2\", end_k=10)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "16bebea4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[21, 'bbced'],\n",
+ " [27, 'febed'],\n",
+ " [28, 'ebedb'],\n",
+ " [32, 'bbcbb'],\n",
+ " [46, 'cebbb'],\n",
+ " [47, 'ebbbb'],\n",
+ " [48, 'bbbbd'],\n",
+ " [49, 'bbbdc'],\n",
+ " [50, 'bbdcb'],\n",
+ " [94, 'cbecf'],\n",
+ " [100, 'bbbbe'],\n",
+ " [101, 'bbbeb'],\n",
+ " [111, 'cdcee'],\n",
+ " [137, 'eebeb'],\n",
+ " [167, 'fbdee'],\n",
+ " [168, 'bdeeb'],\n",
+ " [171, 'ebeeb'],\n",
+ " [177, 'bbfeb'],\n",
+ " [185, 'ecbec'],\n",
+ " [217, 'cdbfc'],\n",
+ " [252, 'beebb'],\n",
+ " [256, 'bbebb'],\n",
+ " [262, 'bebee'],\n",
+ " [264, 'beebe']]"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Row 0 of df is ksize=5: shared 5-mers as [first 0-based position, kmer] pairs on the CED9 (query) side.\n",
+ "kmer5ced9bcl2intersection = df.iloc[0]['query_intersection_positions']\n",
+ "kmer5ced9bcl2intersection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "251ce38c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[10, 'cdcee'],\n",
+ " [24, 'cdbfc'],\n",
+ " [33, 'cebbb'],\n",
+ " [34, 'ebbbb'],\n",
+ " [43, 'bbbbe'],\n",
+ " [49, 'bbcbb'],\n",
+ " [58, 'bbbbd'],\n",
+ " [59, 'bbbdc'],\n",
+ " [60, 'bbdcb'],\n",
+ " [82, 'bbbeb'],\n",
+ " [83, 'bbebb'],\n",
+ " [115, 'bbced'],\n",
+ " [139, 'cbecf'],\n",
+ " [143, 'fbdee'],\n",
+ " [144, 'bdeeb'],\n",
+ " [158, 'ecbec'],\n",
+ " [213, 'febed'],\n",
+ " [214, 'ebedb'],\n",
+ " [218, 'beebe'],\n",
+ " [219, 'eebeb'],\n",
+ " [221, 'bebee'],\n",
+ " [222, 'ebeeb'],\n",
+ " [223, 'beebb'],\n",
+ " [232, 'bbfeb']]"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Row 0 of df is ksize=5: shared 5-mers as [first 0-based position, kmer] pairs on the BCL2 (match) side.\n",
+ "kmer5ced9bcl2match = df.iloc[0]['match_intersection_positions']\n",
+ "kmer5ced9bcl2match"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "043aff40",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[27, 'febedb'],\n",
+ " [46, 'cebbbb'],\n",
+ " [48, 'bbbbdc'],\n",
+ " [49, 'bbbdcb'],\n",
+ " [100, 'bbbbeb'],\n",
+ " [167, 'fbdeeb'],\n",
+ " [255, 'bbbebb'],\n",
+ " [262, 'bebeeb']]"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Row 1 of df is ksize=6: shared 6-mers as [first 0-based position, kmer] pairs on the CED9 (query) side.\n",
+ "kmer6ced9bcl2intersection = df.iloc[1]['query_intersection_positions']\n",
+ "kmer6ced9bcl2intersection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "85996100",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[33, 'cebbbb'],\n",
+ " [58, 'bbbbdc'],\n",
+ " [59, 'bbbdcb'],\n",
+ " [81, 'bbbbeb'],\n",
+ " [82, 'bbbebb'],\n",
+ " [143, 'fbdeeb'],\n",
+ " [213, 'febedb'],\n",
+ " [221, 'bebeeb']]"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Row 1 of df is ksize=6: shared 6-mers as [first 0-based position, kmer] pairs on the BCL2 (match) side.\n",
+ "kmer6ced9bcl2match = df.iloc[1]['match_intersection_positions']\n",
+ "kmer6ced9bcl2match"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "29e43f64-b3f0-4bdd-9286-004d0749ffe9",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:33.206756Z",
+ "iopub.status.busy": "2024-11-12T21:31:33.206652Z",
+ "iopub.status.idle": "2024-11-12T21:31:33.237938Z",
+ "shell.execute_reply": "2024-11-12T21:31:33.237703Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:33.206746Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " query | \n",
+ " match | \n",
+ " moltype | \n",
+ " ksize | \n",
+ " jaccard | \n",
+ " query_n_kmers | \n",
+ " query_n_unique_kmers | \n",
+ " query_intersection_positions | \n",
+ " match_n_kmers | \n",
+ " match_n_unique_kmers | \n",
+ " match_intersection_positions | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 5 | \n",
+ " 0.055046 | \n",
+ " 276 | \n",
+ " 256 | \n",
+ " [[21, bbced], [27, febed], [28, ebedb], [32, b... | \n",
+ " 235 | \n",
+ " 204 | \n",
+ " [[10, cdcee], [24, cdbfc], [33, cebbb], [34, e... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CED9 | \n",
+ " BCL2 | \n",
+ " dayhoff | \n",
+ " 6 | \n",
+ " 0.016667 | \n",
+ " 275 | \n",
+ " 273 | \n",
+ " [[27, febedb], [46, cebbbb], [48, bbbbdc], [49... | \n",
+ " 234 | \n",
+ " 215 | \n",
+ " [[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n",
+ "0 CED9 BCL2 dayhoff 5 0.055046 276 256 \n",
+ "1 CED9 BCL2 dayhoff 6 0.016667 275 273 \n",
+ "\n",
+ " query_intersection_positions match_n_kmers \\\n",
+ "0 [[21, bbced], [27, febed], [28, ebedb], [32, b... 235 \n",
+ "1 [[27, febedb], [46, cebbbb], [48, bbbbdc], [49... 234 \n",
+ "\n",
+ " match_n_unique_kmers match_intersection_positions \n",
+ "0 204 [[10, cdcee], [24, cdbfc], [33, cebbb], [34, e... \n",
+ "1 215 [[33, cebbbb], [58, bbbbdc], [59, bbbdcb], [81... "
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.query(\"ksize >= 5 and ksize <= 6\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bba34ea5-8164-42b3-ba03-054e225b2963",
+ "metadata": {},
+ "source": [
+ "### Make Sourmash signatures for CED9, BCL2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f91b6c5b-254d-4a33-9afe-e1d442a13f65",
+ "metadata": {},
+ "source": [
+ "#### CED9"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "cdacb2ab-faa7-45f4-b43b-6e43ff61375f",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:54:04.752153Z",
+ "iopub.status.busy": "2024-11-12T21:54:04.751809Z",
+ "iopub.status.idle": "2024-11-12T21:54:04.754887Z",
+ "shell.execute_reply": "2024-11-12T21:54:04.754601Z",
+ "shell.execute_reply.started": "2024-11-12T21:54:04.752139Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'MTRCTADNSLTNPAYRRRTMATGEMKEFLGIKGTEPTDFGINSDAQDLPSPSRQASTRRMSIGESIDGKINDWEEPRLDIEGFVVDYFTHRIRQNGMEWFGAPGLPCGVQPEHEMMRVMGTIFEKKHAENFETFCEQLLAVPRISFSLYQDVVRTVGNAQTDQCPMSYGRLIGLISFGGFVAAKMMESVELQGQVRNLFVYTSLFIKTRIRNNWKEHNRSWDDFMTLGKQMKEDYERAEAEKVGRRKQNRRWSMIGAGVTAGAIGIVGVVVCGRMMFSLK'"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ced9_seq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "6f770cb4-4c3a-4968-b3ae-27ccede71ecb",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:54:53.753581Z",
+ "iopub.status.busy": "2024-11-12T21:54:53.753250Z",
+ "iopub.status.idle": "2024-11-12T21:54:53.756533Z",
+ "shell.execute_reply": "2024-11-12T21:54:53.756224Z",
+ "shell.execute_reply.started": "2024-11-12T21:54:53.753567Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overwriting test-data/ced9.fasta\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%file test-data/ced9.fasta\n",
+ ">ced9\n",
+ "MTRCTADNSLTNPAYRRRTMATGEMKEFLGIKGTEPTDFGINSDAQDLPSPSRQASTRRMSIGESIDGKINDWEEPRLDIEGFVVDYFTHRIRQNGMEWFGAPGLPCGVQPEHEMMRVMGTIFEKKHAENFETFCEQLLAVPRISFSLYQDVVRTVGNAQTDQCPMSYGRLIGLISFGGFVAAKMMESVELQGQVRNLFVYTSLFIKTRIRNNWKEHNRSWDDFMTLGKQMKEDYERAEAEKVGRRKQNRRWSMIGAGVTAGAIGIVGVVVCGRMMFSLK"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3a9e2d8-84e2-44a0-a9ec-2f15a41d9340",
+ "metadata": {},
+ "source": [
+ "#### BCL2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "0b2b11c3-549c-4b50-86cf-46b68830f52e",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:54:56.856675Z",
+ "iopub.status.busy": "2024-11-12T21:54:56.856350Z",
+ "iopub.status.idle": "2024-11-12T21:54:56.859329Z",
+ "shell.execute_reply": "2024-11-12T21:54:56.859070Z",
+ "shell.execute_reply.started": "2024-11-12T21:54:56.856662Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPPVVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK'"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bcl2_seq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "0a0eb7e6-7cf0-473c-a3b9-651bdfa074eb",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:55:06.101451Z",
+ "iopub.status.busy": "2024-11-12T21:55:06.101120Z",
+ "iopub.status.idle": "2024-11-12T21:55:06.104798Z",
+ "shell.execute_reply": "2024-11-12T21:55:06.104500Z",
+ "shell.execute_reply.started": "2024-11-12T21:55:06.101437Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overwriting test-data/bcl2.fasta\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%file test-data/bcl2.fasta\n",
+ ">bcl2\n",
+ "MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAPGIFSSQPGHTPHPAASRDPVARTSPLQTPAAPGAAAGPALSPVPPVVHLTLRQAGDDFSRRYRRDFAEMSSQLHLTPFTARGRFATVVEELFRDGVNWGRIVAFFEFGGVMCVESVNREMSPLVDNIALWMTEYLNRHLHTWIQDNGGWDAFVELYGPSMRPLFDFSWLSLKTLLSLALVGACITLGAYLGHK"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b43fab68-3588-492d-9e7b-f8fab806725a",
+ "metadata": {},
+ "source": [
+ "### Compute signatures"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "52d8bf59-8e38-4578-825d-82e780b6a513",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:55:37.747266Z",
+ "iopub.status.busy": "2024-11-12T21:55:37.746938Z",
+ "iopub.status.idle": "2024-11-12T21:55:39.944359Z",
+ "shell.execute_reply": "2024-11-12T21:55:39.943940Z",
+ "shell.execute_reply.started": "2024-11-12T21:55:37.747252Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/ced9.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/ced9.fasta\n",
+ "\n",
+ "\u001b[K... test-data/ced9.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/ced9.dayhoff.k5.sig'\n",
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/bcl2.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/bcl2.fasta\n",
+ "\n",
+ "\u001b[K... test-data/bcl2.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/bcl2.dayhoff.k5.sig'\n",
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/ced9.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/ced9.fasta\n",
+ "\n",
+ "\u001b[K... test-data/ced9.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/ced9.dayhoff.k6.sig'\n",
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/bcl2.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/bcl2.fasta\n",
+ "\n",
+ "\u001b[K... test-data/bcl2.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/bcl2.dayhoff.k6.sig'\n"
+ ]
+ }
+ ],
+ "source": [
+ "ksizes = 5, 6\n",
+ "ced9_bcl2_fastas = {\"ced9\": \"test-data/ced9.fasta\", \"bcl2\": \"test-data/bcl2.fasta\"}\n",
+ "# Maps (name, ksize) -> path of the signature file written for that pair.\n",
+ "ced9_bcl2_sigfiles = {}\n",
+ "\n",
+ "\n",
+ "# Sketch one signature file per (ksize, protein) with the sourmash CLI.\n",
+ "for ksize in ksizes:\n",
+ "    for name, fasta in ced9_bcl2_fastas.items():\n",
+ "        # NOTE(review): scaled=1 is what triggers the \"scaled value should be >= 100\"\n",
+ "        # warning in this cell's output; presumably intentional so that no k-mers\n",
+ "        # are subsampled from these short proteins -- confirm.\n",
+ "        param_string = f\"dayhoff,scaled=1,k={ksize}\"\n",
+ "        sig = f\"test-data/{name}.dayhoff.k{ksize}.sig\"\n",
+ "        # IPython shell escape: $var is expanded from the Python namespace.\n",
+ "        ! sourmash sketch protein -p $param_string --name $name $fasta -o $sig\n",
+ "        ced9_bcl2_sigfiles[(name, ksize)] = sig"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "39fa469c-4799-4c64-b683-816ae26948ca",
+ "metadata": {},
+ "source": [
+ "### Load Signatures"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "953365a4-f46e-45ae-9fb9-944c1af5e43b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:55:57.556563Z",
+ "iopub.status.busy": "2024-11-12T21:55:57.556065Z",
+ "iopub.status.idle": "2024-11-12T21:55:57.563257Z",
+ "shell.execute_reply": "2024-11-12T21:55:57.563020Z",
+ "shell.execute_reply.started": "2024-11-12T21:55:57.556546Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{5: {'ced9': SourmashSignature('ced9', 1dd6b6f2),\n",
+ " 'bcl2': SourmashSignature('bcl2', 49f32c24)},\n",
+ " 6: {'ced9': SourmashSignature('ced9', 1dd6b6f2),\n",
+ " 'bcl2': SourmashSignature('bcl2', 49f32c24)}}"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ksizes = 5, 6\n",
+ "# Give every ksize its own inner dict. dict.fromkeys(ksizes, {}) would bind all\n",
+ "# keys to one shared dict, so every ksize would end up showing the same signatures.\n",
+ "ced9_bcl2_sigs = {ksize: {} for ksize in ksizes}\n",
+ "\n",
+ "# Take the first signature from each file and store it under its ksize and name.\n",
+ "for (name, ksize), sigfile in ced9_bcl2_sigfiles.items():\n",
+ "    ced9_bcl2_sigs[ksize][name] = list(sourmash.load_file_as_signatures(sigfile))[0]\n",
+ "ced9_bcl2_sigs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2795adfb-6bbd-47ef-8bd4-a22a22129813",
+ "metadata": {},
+ "source": [
+ "### Show SigSeq Alignment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "12131870-92f9-42cd-9b76-842dd2b429aa",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:56:30.609180Z",
+ "iopub.status.busy": "2024-11-12T21:56:30.608857Z",
+ "iopub.status.idle": "2024-11-12T21:56:30.618914Z",
+ "shell.execute_reply": "2024-11-12T21:56:30.618645Z",
+ "shell.execute_reply.started": "2024-11-12T21:56:30.609167Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ksize: 8\n"
+ ]
+ },
+ {
+ "ename": "IndexError",
+ "evalue": "list index out of range",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[20], line 10\u001b[0m\n\u001b[0;32m 7\u001b[0m bcl2_sigseq \u001b[38;5;241m=\u001b[39m SigSeq(sigs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbcl2\u001b[39m\u001b[38;5;124m\"\u001b[39m], bcl2_seq)\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 10\u001b[0m \u001b[43mced9_sigseq\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdisplay_alignment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbcl2_sigseq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28mprint\u001b[39m(e)\n",
+ "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:244\u001b[0m, in \u001b[0;36mSigSeq.display_alignment\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[38;5;124;03m\"\"\"Displays the alignment between two sequences\"\"\"\u001b[39;00m\n\u001b[0;32m 243\u001b[0m \u001b[38;5;66;03m# Compute overlaps\u001b[39;00m\n\u001b[1;32m--> 244\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_overlap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 245\u001b[0m other\u001b[38;5;241m.\u001b[39mcompute_overlap(\u001b[38;5;28mself\u001b[39m)\n\u001b[0;32m 247\u001b[0m \u001b[38;5;66;03m# Verify overlaps\u001b[39;00m\n",
+ "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:194\u001b[0m, in \u001b[0;36mSigSeq.compute_overlap\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 191\u001b[0m overlap_encoded \u001b[38;5;241m=\u001b[39m KmerStitcher\u001b[38;5;241m.\u001b[39mstitch_kmers(overlap, use_encoded\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 193\u001b[0m index \u001b[38;5;241m=\u001b[39m other\u001b[38;5;241m.\u001b[39mseq\u001b[38;5;241m.\u001b[39mindex(overlap_seq)\n\u001b[1;32m--> 194\u001b[0m overlap_length \u001b[38;5;241m=\u001b[39m \u001b[43moverlap\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m-\u001b[39m overlap[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msig\u001b[38;5;241m.\u001b[39mminhash\u001b[38;5;241m.\u001b[39mksize\n\u001b[0;32m 196\u001b[0m \u001b[38;5;66;03m# Since the overlapping k-mers were originally in 'other', not 'self' -> assign to 'other'\u001b[39;00m\n\u001b[0;32m 197\u001b[0m \u001b[38;5;66;03m# If we had used 'self', then would have returned ALL k-mers since they are all present in self\u001b[39;00m\n\u001b[0;32m 198\u001b[0m other\u001b[38;5;241m.\u001b[39moverlap \u001b[38;5;241m=\u001b[39m OverlapInfo(index, overlap_seq, overlap_encoded, overlap_length)\n",
+ "\u001b[1;31mIndexError\u001b[0m: list index out of range"
+ ]
+ }
+ ],
+ "source": [
+ "from sigseq import SigSeq\n",
+ "\n",
+ "# Display the ced9 vs. bcl2 alignment for every ksize that has loaded signatures.\n",
+ "for ksize, sigs in ced9_bcl2_sigs.items():\n",
+ "    print(f\"ksize: {ksize}\")\n",
+ "\n",
+ "    ced9_sigseq = SigSeq(sigs[\"ced9\"], ced9_seq)\n",
+ "    bcl2_sigseq = SigSeq(sigs[\"bcl2\"], bcl2_seq)\n",
+ "\n",
+ "    try:\n",
+ "        ced9_sigseq.display_alignment(bcl2_sigseq)\n",
+ "    except ValueError as e:\n",
+ "        print(e)\n",
+ "    except IndexError as e:\n",
+ "        # SigSeq.compute_overlap indexes into the list of overlapping k-mers and\n",
+ "        # raises IndexError when that list is empty; report it and move on to the\n",
+ "        # next ksize instead of aborting the whole cell.\n",
+ "        print(f\"No alignment to display ({e}); likely no shared k-mers at this ksize\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "c422de6c",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "IndexError",
+ "evalue": "list index out of range",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[21], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mced9_sigseq\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdisplay_alignment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbcl2_sigseq\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:244\u001b[0m, in \u001b[0;36mSigSeq.display_alignment\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[38;5;124;03m\"\"\"Displays the alignment between two sequences\"\"\"\u001b[39;00m\n\u001b[0;32m 243\u001b[0m \u001b[38;5;66;03m# Compute overlaps\u001b[39;00m\n\u001b[1;32m--> 244\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_overlap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 245\u001b[0m other\u001b[38;5;241m.\u001b[39mcompute_overlap(\u001b[38;5;28mself\u001b[39m)\n\u001b[0;32m 247\u001b[0m \u001b[38;5;66;03m# Verify overlaps\u001b[39;00m\n",
+ "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:194\u001b[0m, in \u001b[0;36mSigSeq.compute_overlap\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 191\u001b[0m overlap_encoded \u001b[38;5;241m=\u001b[39m KmerStitcher\u001b[38;5;241m.\u001b[39mstitch_kmers(overlap, use_encoded\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 193\u001b[0m index \u001b[38;5;241m=\u001b[39m other\u001b[38;5;241m.\u001b[39mseq\u001b[38;5;241m.\u001b[39mindex(overlap_seq)\n\u001b[1;32m--> 194\u001b[0m overlap_length \u001b[38;5;241m=\u001b[39m \u001b[43moverlap\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m-\u001b[39m overlap[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msig\u001b[38;5;241m.\u001b[39mminhash\u001b[38;5;241m.\u001b[39mksize\n\u001b[0;32m 196\u001b[0m \u001b[38;5;66;03m# Since the overlapping k-mers were originally in 'other', not 'self' -> assign to 'other'\u001b[39;00m\n\u001b[0;32m 197\u001b[0m \u001b[38;5;66;03m# If we had used 'self', then would have returned ALL k-mers since they are all present in self\u001b[39;00m\n\u001b[0;32m 198\u001b[0m other\u001b[38;5;241m.\u001b[39moverlap \u001b[38;5;241m=\u001b[39m OverlapInfo(index, overlap_seq, overlap_encoded, overlap_length)\n",
+ "\u001b[1;31mIndexError\u001b[0m: list index out of range"
+ ]
+ }
+ ],
+ "source": [
+ "# Re-display the alignment for the SigSeq objects left over from the loop above.\n",
+ "# Guarded so that an empty overlap (IndexError) or non-sequential k-mers\n",
+ "# (ValueError) does not stop a Restart & Run All.\n",
+ "try:\n",
+ "    ced9_sigseq.display_alignment(bcl2_sigseq)\n",
+ "except (ValueError, IndexError) as e:\n",
+ "    print(f\"Could not display alignment: {type(e).__name__}: {e}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66214b65-33c1-4594-836d-0df045a8eb04",
+ "metadata": {},
+ "source": [
+ "### "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e4cef2f1-dd57-491b-993c-24b85415af46",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## P66 and CD47\n",
+ "\n",
+ "From [P66 is a bacterial mimic of CD47 that binds the anti-phagocytic receptor SIRPα and facilitates macrophage evasion by Borrelia burgdorferi](https://www.biorxiv.org/content/10.1101/2024.04.29.591704v1.full)\n",
+ "\n",
+ "> Protein alignments were performed through Uniprot (www.uniprot.org) using the Clustal Omega Program23,24. The following proteins were used for analysis: CD47_HUMAN (Q08722), and H7C7N8(P66)_BORBU (H7C7N8).\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "bdb27e70-40c1-4aa8-981d-3aa234a5496b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:34.148120Z",
+ "iopub.status.busy": "2024-11-12T21:31:34.147864Z",
+ "iopub.status.idle": "2024-11-12T21:31:34.150240Z",
+ "shell.execute_reply": "2024-11-12T21:31:34.149995Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:34.148107Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "CD47_HUMAN = \"MWPLVAALLLGSACCGSAQLLFNKTKSVEFTFCNDTVVIPCFVTNMEAQNTTEVYVKWKFKGRDIYTFDGALNKSTVPTDFSSAKIEVSQLLKGDASLKMDKSDAVSHTGNYTCEVTELTREGETIIELKYRVVSWFSPNENILIVIFPIFAILLFWGQFGIKTLKYRSGGMDEKTIALLVAGLVITVIVIVGAILFVPGEYSLKNATGLGLIVTSTGILILLHYYVFSTAIGLTSFVIAILVIQVIAYILAVVGLSLCIAACIPMHGPLLISGLSILALAQLLGLVYMKFVASNQKTIQPPRKAVEEPLNAFKESKGMMNDE\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "fe826dbe-334f-4038-8ef1-9682455e28f7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:34.345613Z",
+ "iopub.status.busy": "2024-11-12T21:31:34.345364Z",
+ "iopub.status.idle": "2024-11-12T21:31:34.347806Z",
+ "shell.execute_reply": "2024-11-12T21:31:34.347546Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:34.345600Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "H7C7N8_BORBU = \"MKSHILYKLIIFLTTSAAIFAADALKEKDIFKINPWMPTFGFENTSEFRLDMDELVPGFENKSKITIKLKPFEANPELGKDDPFSAYIKVEDLALKAEGKKGDQFKIDVGDITAQINMYDFFIKISTMTDFDFNKESLFSFAPMTGFKSTYYGFPSNDRAVRGTILARGTSKNIGTIQLGYKLPKLDLTFAIGGTGTGNRNQENDKDTPYNKTYQGILYGIQATWKPIKNLLDQNEDTKSVIAETPFELNFGLSGAYGNETFNNSSITYSLKDKSVVGNDLLSPTLSNSAILASFGAKYKLGLTKINDKNTYLILQMGTDFGIDPFASDFSIFGHISKAANFKKETPSDPNKKAEIFDPNGNALNFSKNTELGIAFSTGASIGFAWNKDTGEKESWAIKGSDSYSTRLFGEQDKKSGVALGISYGQNLYRSKDTEKRLKTISENAFQSLNVEISSYEDNKKGIINGLGWITSIGLYDILRQKSVENYPTTISSTTENNQTEQSSTSTKTTTPNLTFEDAMKLGLALYLDYAIPIASISTEAYVVPYIGAYILGPSNKLSSDATKIYLKTGLSLEKLIRFTTISLGWDSNNIIELANKNTNNAAIGSAFLQFKIAYSGS\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "ad498f99-a8c2-403f-b827-c6065c02725a",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:34.529424Z",
+ "iopub.status.busy": "2024-11-12T21:31:34.529059Z",
+ "iopub.status.idle": "2024-11-12T21:31:34.653853Z",
+ "shell.execute_reply": "2024-11-12T21:31:34.653567Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:34.529412Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " query | \n",
+ " match | \n",
+ " moltype | \n",
+ " ksize | \n",
+ " jaccard | \n",
+ " query_n_kmers | \n",
+ " query_n_unique_kmers | \n",
+ " query_intersection_positions | \n",
+ " match_n_kmers | \n",
+ " match_n_unique_kmers | \n",
+ " match_intersection_positions | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 5 | \n",
+ " 0.098266 | \n",
+ " 319 | \n",
+ " 269 | \n",
+ " [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... | \n",
+ " 614 | \n",
+ " 491 | \n",
+ " [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 6 | \n",
+ " 0.026838 | \n",
+ " 318 | \n",
+ " 294 | \n",
+ " [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... | \n",
+ " 613 | \n",
+ " 586 | \n",
+ " [[9, eefebb], [75, bcebdc], [105, decebc], [10... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 7 | \n",
+ " 0.006579 | \n",
+ " 317 | \n",
+ " 306 | \n",
+ " [[84, decebce], [105, ebdbbcf], [197, ebbcfbe]... | \n",
+ " 612 | \n",
+ " 612 | \n",
+ " [[105, decebce], [154, bbccdbe], [159, bedbbee... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 8 | \n",
+ " 0.000000 | \n",
+ " 316 | \n",
+ " 314 | \n",
+ " [] | \n",
+ " 611 | \n",
+ " 611 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 9 | \n",
+ " 0.000000 | \n",
+ " 315 | \n",
+ " 315 | \n",
+ " [] | \n",
+ " 610 | \n",
+ " 610 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 10 | \n",
+ " 0.000000 | \n",
+ " 314 | \n",
+ " 314 | \n",
+ " [] | \n",
+ " 609 | \n",
+ " 609 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n",
+ "0 CD47 p66 dayhoff 5 0.098266 319 269 \n",
+ "1 CD47 p66 dayhoff 6 0.026838 318 294 \n",
+ "2 CD47 p66 dayhoff 7 0.006579 317 306 \n",
+ "3 CD47 p66 dayhoff 8 0.000000 316 314 \n",
+ "4 CD47 p66 dayhoff 9 0.000000 315 315 \n",
+ "5 CD47 p66 dayhoff 10 0.000000 314 314 \n",
+ "\n",
+ " query_intersection_positions match_n_kmers \\\n",
+ "0 [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... 614 \n",
+ "1 [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... 613 \n",
+ "2 [[84, decebce], [105, ebdbbcf], [197, ebbcfbe]... 612 \n",
+ "3 [] 611 \n",
+ "4 [] 610 \n",
+ "5 [] 609 \n",
+ "\n",
+ " match_n_unique_kmers match_intersection_positions \n",
+ "0 491 [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... \n",
+ "1 586 [[9, eefebb], [75, bcebdc], [105, decebc], [10... \n",
+ "2 612 [[105, decebce], [154, bbccdbe], [159, bedbbee... \n",
+ "3 611 [] \n",
+ "4 610 [] \n",
+ "5 609 [] "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Compare CD47 (query) against P66 (match); end_k=10 is the last argument passed.\n",
+ "p66_cd47_df = compare_sequences(\n",
+ "    CD47_HUMAN,\n",
+ "    H7C7N8_BORBU,\n",
+ "    \"CD47\",\n",
+ "    \"p66\",\n",
+ "    end_k=10,\n",
+ ")\n",
+ "p66_cd47_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "5e162f4d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[21, 'bbced'],\n",
+ " [27, 'febed'],\n",
+ " [28, 'ebedb'],\n",
+ " [32, 'bbcbb'],\n",
+ " [46, 'cebbb'],\n",
+ " [47, 'ebbbb'],\n",
+ " [48, 'bbbbd'],\n",
+ " [49, 'bbbdc'],\n",
+ " [50, 'bbdcb'],\n",
+ " [94, 'cbecf'],\n",
+ " [100, 'bbbbe'],\n",
+ " [101, 'bbbeb'],\n",
+ " [111, 'cdcee'],\n",
+ " [137, 'eebeb'],\n",
+ " [167, 'fbdee'],\n",
+ " [168, 'bdeeb'],\n",
+ " [171, 'ebeeb'],\n",
+ " [177, 'bbfeb'],\n",
+ " [185, 'ecbec'],\n",
+ " [217, 'cdbfc'],\n",
+ " [252, 'beebb'],\n",
+ " [256, 'bbebb'],\n",
+ " [262, 'bebee'],\n",
+ " [264, 'beebe']]"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read from p66_cd47_df (not a stale `df` left over from another comparison).\n",
+ "# Row 0 is the ksize=5 comparison in the table above.\n",
+ "kmer5p66cd47intersection = p66_cd47_df.iloc[0]['query_intersection_positions']\n",
+ "kmer5p66cd47intersection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "451bc42d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[10, 'cdcee'],\n",
+ " [24, 'cdbfc'],\n",
+ " [33, 'cebbb'],\n",
+ " [34, 'ebbbb'],\n",
+ " [43, 'bbbbe'],\n",
+ " [49, 'bbcbb'],\n",
+ " [58, 'bbbbd'],\n",
+ " [59, 'bbbdc'],\n",
+ " [60, 'bbdcb'],\n",
+ " [82, 'bbbeb'],\n",
+ " [83, 'bbebb'],\n",
+ " [115, 'bbced'],\n",
+ " [139, 'cbecf'],\n",
+ " [143, 'fbdee'],\n",
+ " [144, 'bdeeb'],\n",
+ " [158, 'ecbec'],\n",
+ " [213, 'febed'],\n",
+ " [214, 'ebedb'],\n",
+ " [218, 'beebe'],\n",
+ " [219, 'eebeb'],\n",
+ " [221, 'bebee'],\n",
+ " [222, 'ebeeb'],\n",
+ " [223, 'beebb'],\n",
+ " [232, 'bbfeb']]"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read from p66_cd47_df (not a stale `df` left over from another comparison).\n",
+ "# Row 0 is the ksize=5 comparison in the table above.\n",
+ "kmer5p66cd47match = p66_cd47_df.iloc[0]['match_intersection_positions']\n",
+ "kmer5p66cd47match"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "dc528452",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[27, 'febedb'],\n",
+ " [46, 'cebbbb'],\n",
+ " [48, 'bbbbdc'],\n",
+ " [49, 'bbbdcb'],\n",
+ " [100, 'bbbbeb'],\n",
+ " [167, 'fbdeeb'],\n",
+ " [255, 'bbbebb'],\n",
+ " [262, 'bebeeb']]"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read from p66_cd47_df (not a stale `df` left over from another comparison).\n",
+ "# Row 1 is the ksize=6 comparison in the table above.\n",
+ "kmer6p66cd47intersection = p66_cd47_df.iloc[1]['query_intersection_positions']\n",
+ "kmer6p66cd47intersection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "55df9052",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[[33, 'cebbbb'],\n",
+ " [58, 'bbbbdc'],\n",
+ " [59, 'bbbdcb'],\n",
+ " [81, 'bbbbeb'],\n",
+ " [82, 'bbbebb'],\n",
+ " [143, 'fbdeeb'],\n",
+ " [213, 'febedb'],\n",
+ " [221, 'bebeeb']]"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read from p66_cd47_df (not a stale `df` left over from another comparison).\n",
+ "# Row 1 is the ksize=6 comparison in the table above.\n",
+ "kmer6p66cd47match = p66_cd47_df.iloc[1]['match_intersection_positions']\n",
+ "kmer6p66cd47match"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "42f65950",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b827bc3f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ade0f902",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c0f17735",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f26d57c-29bc-4c04-8dd8-48fab6bf5fb3",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-11T18:37:59.846385Z",
+ "iopub.status.busy": "2024-11-11T18:37:59.846101Z",
+ "iopub.status.idle": "2024-11-11T18:37:59.849548Z",
+ "shell.execute_reply": "2024-11-11T18:37:59.849227Z",
+ "shell.execute_reply.started": "2024-11-11T18:37:59.846369Z"
+ }
+ },
+ "source": [
+ "\n",
+ "### Q: Which P66 residues mediate SIRPα binding (mimicking CD47)? A: 181-187aa\n",
+ "\n",
+ "> Utilizing a p66-deficient B. burgdorferi strain of B31-A3 (Δp66) we determined that P66 is required for CV1-G4 surface binding (Figure 2A). We next sought to determine residues on P66 critical for SIRPɑ interaction. We have previously demonstrated that two aspartic acid residues, D184 and D186, on a predicted extracellular loop of P66 (181–187) are required for integrin binding19. B. burgdorferi expressing the mutant D184A and D186A, p66D184A,D186A, or loss of the loop, p66Δ181−187, demonstrated loss of CV1-G4 binding (Figure 2A). Consistent to previous structure predictions, these sites map to an unstructured extracellular loop on a structure of P66 generated by Alphafold2 (Figure 2B and Extended Data 2A). We postulate this region is also required for SIRPɑ binding. Importantly, while these residues are critical for binding integrins and P66, loss of this loop or mutation of the two aspartic acid residues does not affect P66 cell surface localization19.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e085b44e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "b4a2745a-b8e5-4d42-9df3-261775255ad6",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:34.921046Z",
+ "iopub.status.busy": "2024-11-12T21:31:34.920724Z",
+ "iopub.status.idle": "2024-11-12T21:31:34.924464Z",
+ "shell.execute_reply": "2024-11-12T21:31:34.924221Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:34.921033Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[True,\n",
+ " True,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False,\n",
+ " False]"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# P66 loop reported as required for binding: residues 181-187 (see the quote above).\n",
+ "p66_binding_positions = set(range(181, 188))\n",
+ "\n",
+ "# One flag per row (ksize) of p66_cd47_df: True when any shared k-mer on the P66\n",
+ "# (match) side *spans* the loop. Testing only the k-mer start position would miss\n",
+ "# k-mers that begin just upstream but still cover binding residues.\n",
+ "# NOTE(review): the paper numbers residues from 1; confirm whether compare_sequences\n",
+ "# reports 0- or 1-based k-mer start positions.\n",
+ "overlapping_p66_cd47_mimickry = [\n",
+ "    any(\n",
+ "        not p66_binding_positions.isdisjoint(range(start, start + len(kmer)))\n",
+ "        for start, kmer in positions\n",
+ "    )\n",
+ "    for positions in p66_cd47_df.match_intersection_positions\n",
+ "]\n",
+ "overlapping_p66_cd47_mimickry"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "8aa13116-0402-4764-803e-76119148babc",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:35.503641Z",
+ "iopub.status.busy": "2024-11-12T21:31:35.503391Z",
+ "iopub.status.idle": "2024-11-12T21:31:35.539428Z",
+ "shell.execute_reply": "2024-11-12T21:31:35.539170Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:35.503628Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " query | \n",
+ " match | \n",
+ " moltype | \n",
+ " ksize | \n",
+ " jaccard | \n",
+ " query_n_kmers | \n",
+ " query_n_unique_kmers | \n",
+ " query_intersection_positions | \n",
+ " match_n_kmers | \n",
+ " match_n_unique_kmers | \n",
+ " match_intersection_positions | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 5 | \n",
+ " 0.098266 | \n",
+ " 319 | \n",
+ " 269 | \n",
+ " [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... | \n",
+ " 614 | \n",
+ " 491 | \n",
+ " [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 6 | \n",
+ " 0.026838 | \n",
+ " 318 | \n",
+ " 294 | \n",
+ " [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... | \n",
+ " 613 | \n",
+ " 586 | \n",
+ " [[9, eefebb], [75, bcebdc], [105, decebc], [10... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n",
+ "0 CD47 p66 dayhoff 5 0.098266 319 269 \n",
+ "1 CD47 p66 dayhoff 6 0.026838 318 294 \n",
+ "\n",
+ " query_intersection_positions match_n_kmers \\\n",
+ "0 [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... 614 \n",
+ "1 [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... 613 \n",
+ "\n",
+ " match_n_unique_kmers match_intersection_positions \n",
+ "0 491 [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... \n",
+ "1 586 [[9, eefebb], [75, bcebdc], [105, decebc], [10... "
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show only the ksize rows flagged as touching the P66 binding loop.\n",
+ "binding_overlap_mask = overlapping_p66_cd47_mimickry\n",
+ "p66_cd47_df.loc[binding_overlap_mask, :]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "63e5771c-e1f5-452a-8417-1ba9cc8164d6",
+ "metadata": {},
+ "source": [
+ "### Make Sourmash signatures for P66, CD47"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "762ae606-f888-4a85-97a6-0b45e61634e1",
+ "metadata": {},
+ "source": [
+ "#### CD47"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "dd8f56ea-bf46-42b5-a4ce-58916b1212f3",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:36.388801Z",
+ "iopub.status.busy": "2024-11-12T21:31:36.388484Z",
+ "iopub.status.idle": "2024-11-12T21:31:36.391508Z",
+ "shell.execute_reply": "2024-11-12T21:31:36.391259Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:36.388788Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overwriting test-data/cd47.fasta\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%file test-data/cd47.fasta\n",
+ ">CD47_HUMAN\n",
+ "MWPLVAALLLGSACCGSAQLLFNKTKSVEFTFCNDTVVIPCFVTNMEAQNTTEVYVKWKFKGRDIYTFDGALNKSTVPTDFSSAKIEVSQLLKGDASLKMDKSDAVSHTGNYTCEVTELTREGETIIELKYRVVSWFSPNENILIVIFPIFAILLFWGQFGIKTLKYRSGGMDEKTIALLVAGLVITVIVIVGAILFVPGEYSLKNATGLGLIVTSTGILILLHYYVFSTAIGLTSFVIAILVIQVIAYILAVVGLSLCIAACIPMHGPLLISGLSILALAQLLGLVYMKFVASNQKTIQPPRKAVEEPLNAFKESKGMMNDE"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "655714be-f3f0-40ac-8641-96eae10c6e2e",
+ "metadata": {},
+ "source": [
+ "#### P66"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "1aaff219-c98e-4c5d-b057-534703f64485",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:31:37.163245Z",
+ "iopub.status.busy": "2024-11-12T21:31:37.162832Z",
+ "iopub.status.idle": "2024-11-12T21:31:37.165660Z",
+ "shell.execute_reply": "2024-11-12T21:31:37.165403Z",
+ "shell.execute_reply.started": "2024-11-12T21:31:37.163232Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overwriting test-data/p66.fasta\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%file test-data/p66.fasta\n",
+ ">P66_H7C7N8_BORBU\n",
+ "MKSHILYKLIIFLTTSAAIFAADALKEKDIFKINPWMPTFGFENTSEFRLDMDELVPGFENKSKITIKLKPFEANPELGKDDPFSAYIKVEDLALKAEGKKGDQFKIDVGDITAQINMYDFFIKISTMTDFDFNKESLFSFAPMTGFKSTYYGFPSNDRAVRGTILARGTSKNIGTIQLGYKLPKLDLTFAIGGTGTGNRNQENDKDTPYNKTYQGILYGIQATWKPIKNLLDQNEDTKSVIAETPFELNFGLSGAYGNETFNNSSITYSLKDKSVVGNDLLSPTLSNSAILASFGAKYKLGLTKINDKNTYLILQMGTDFGIDPFASDFSIFGHISKAANFKKETPSDPNKKAEIFDPNGNALNFSKNTELGIAFSTGASIGFAWNKDTGEKESWAIKGSDSYSTRLFGEQDKKSGVALGISYGQNLYRSKDTEKRLKTISENAFQSLNVEISSYEDNKKGIINGLGWITSIGLYDILRQKSVENYPTTISSTTENNQTEQSSTSTKTTTPNLTFEDAMKLGLALYLDYAIPIASISTEAYVVPYIGAYILGPSNKLSSDATKIYLKTGLSLEKLIRFTTISLGWDSNNIIELANKNTNNAAIGSAFLQFKIAYSGS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c2986b10-c40f-407d-ba7e-c221db770da2",
+ "metadata": {},
+ "source": [
+ "### Compute signatures"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b150efd8",
+ "metadata": {},
+ "source": [
+ "**Note:** This value was dropped from `ksizes = 8, 9` in the hp dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "id": "ca31427f-04bd-4304-b3a4-b9f28e976760",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:35:43.660315Z",
+ "iopub.status.busy": "2024-11-12T21:35:43.659921Z",
+ "iopub.status.idle": "2024-11-12T21:35:45.842797Z",
+ "shell.execute_reply": "2024-11-12T21:35:45.842366Z",
+ "shell.execute_reply.started": "2024-11-12T21:35:43.660302Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/p66.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/p66.fasta\n",
+ "\n",
+ "\u001b[K... test-data/p66.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/p66.dayhoff.k3.sig'\n",
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/cd47.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/cd47.fasta\n",
+ "\n",
+ "\u001b[K... test-data/cd47.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/cd47.dayhoff.k3.sig'\n",
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/p66.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/p66.fasta\n",
+ "\n",
+ "\u001b[K... test-data/p66.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/p66.dayhoff.k4.sig'\n",
+ "\n",
+ "\u001b[K\n",
+ "== This is sourmash version 4.8.12. ==\n",
+ "\n",
+ "\u001b[K== Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==\n",
+ "\n",
+ "\n",
+ "\u001b[KWARNING: scaled value should be >= 100. Continuing anyway.\n",
+ "\n",
+ "\u001b[Kcomputing signatures for files: test-data/cd47.fasta\n",
+ "\n",
+ "\u001b[KComputing a total of 1 signature(s) for each input.\n",
+ "\n",
+ "\u001b[K... reading sequences from test-data/cd47.fasta\n",
+ "\n",
+ "\u001b[K... test-data/cd47.fasta 1 sequences\n",
+ "\n",
+ "\u001b[Kcalculated 1 signature for 1 sequences taken from 1 files\n",
+ "\n",
+ "\u001b[Ksaved 1 signature(s) to 'test-data/cd47.dayhoff.k4.sig'\n"
+ ]
+ }
+ ],
+ "source": [
+ "ksizes = 3, 4\n",
+ "p66_cd47_fastas = {\"p66\": \"test-data/p66.fasta\", \"cd47\": \"test-data/cd47.fasta\"}\n",
+ "p66_cd47_sigfiles = {}\n",
+ "\n",
+ "# Sketch every FASTA once per ksize with the sourmash CLI and remember where each\n",
+ "# signature file was written, keyed by (name, ksize).\n",
+ "for ksize in ksizes:\n",
+ "    # The sketch parameters depend only on ksize, so build them once per ksize.\n",
+ "    param_string = f\"dayhoff,scaled=1,k={ksize}\"\n",
+ "    for name, fasta in p66_cd47_fastas.items():\n",
+ "        sig = f\"test-data/{name}.dayhoff.k{ksize}.sig\"\n",
+ "        ! sourmash sketch protein -p $param_string --name $name $fasta -o $sig\n",
+ "        p66_cd47_sigfiles[(name, ksize)] = sig"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6274314c-56e2-4041-86ea-8d252f50e35e",
+ "metadata": {},
+ "source": [
+ "### Load Signatures"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "510e0d98",
+ "metadata": {},
+ "source": [
+ "**Note:** This value was dropped from `ksizes = 8, 9` in the hp dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "id": "90f21140-c447-4880-8aa8-7ef2cbc8f917",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:35:47.715555Z",
+ "iopub.status.busy": "2024-11-12T21:35:47.715175Z",
+ "iopub.status.idle": "2024-11-12T21:35:47.721686Z",
+ "shell.execute_reply": "2024-11-12T21:35:47.721466Z",
+ "shell.execute_reply.started": "2024-11-12T21:35:47.715538Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{3: {'p66': SourmashSignature('p66', 06f1c759),\n",
+ " 'cd47': SourmashSignature('cd47', bafcd8ea)},\n",
+ " 4: {'p66': SourmashSignature('p66', 06f1c759),\n",
+ " 'cd47': SourmashSignature('cd47', bafcd8ea)}}"
+ ]
+ },
+ "execution_count": 169,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ksizes = 3, 4\n",
+ "# Build one independent dict per ksize. dict.fromkeys(ksizes, {}) would hand the\n",
+ "# *same* dict object to every key, so each ksize ended up showing whichever\n",
+ "# signatures were loaded last.\n",
+ "p66_cd47_sigs = {ksize: {} for ksize in ksizes}\n",
+ "\n",
+ "for (name, ksize), sigfile in p66_cd47_sigfiles.items():\n",
+ "    # Keep only the first signature found in each file.\n",
+ "    p66_cd47_sigs[ksize][name] = list(sourmash.load_file_as_signatures(sigfile))[0]\n",
+ "p66_cd47_sigs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a46a7223-9c07-4d04-96cd-946e263c2cfb",
+ "metadata": {},
+ "source": [
+ "### Show SigSeq Alignment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "id": "15a5a246-9fe3-4db8-afd1-1bd140c62918",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:56:40.338811Z",
+ "iopub.status.busy": "2024-11-12T21:56:40.338426Z",
+ "iopub.status.idle": "2024-11-12T21:56:40.355520Z",
+ "shell.execute_reply": "2024-11-12T21:56:40.355182Z",
+ "shell.execute_reply.started": "2024-11-12T21:56:40.338798Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ksize: 3\n",
+ "Non-sequential indices -- Previous: 0, current: 2\n",
+ "ksize: 4\n",
+ "Non-sequential indices -- Previous: 0, current: 2\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sigseq import SigSeq\n",
+ "\n",
+ "# Display the p66 vs. cd47 alignment for every ksize that has loaded signatures.\n",
+ "for ksize, sigs in p66_cd47_sigs.items():\n",
+ "    print(f\"ksize: {ksize}\")\n",
+ "\n",
+ "    p66_sigseq = SigSeq(sigs[\"p66\"], H7C7N8_BORBU)\n",
+ "    cd47_sigseq = SigSeq(sigs[\"cd47\"], CD47_HUMAN)\n",
+ "\n",
+ "    try:\n",
+ "        p66_sigseq.display_alignment(cd47_sigseq)\n",
+ "    except ValueError as e:\n",
+ "        print(e)\n",
+ "    except IndexError as e:\n",
+ "        # SigSeq.compute_overlap raised IndexError on an empty overlap list in the\n",
+ "        # ced9/bcl2 run earlier in this notebook; handle it the same way here so one\n",
+ "        # ksize without shared k-mers does not abort the loop.\n",
+ "        print(f\"No alignment to display ({e}); likely no shared k-mers at this ksize\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "id": "00497e2e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sigseq import KmerStitcher, FastaHeaderHighlighter\n",
+ "import sigseq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "id": "4fefdefa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(6, 'fdee', 'YKLI', 10068566436590771491),\n",
+ " (8, 'eeef', 'LIIF', 224116582751336810),\n",
+ " (9, 'eefe', 'IIFL', 7860449462917742889),\n",
+ " (10, 'efeb', 'IFLT', 4466143379625637480),\n",
+ " (11, 'febb', 'FLTT', 22856495822854259),\n",
+ " (12, 'ebbb', 'LTTS', 6489788892270137876),\n",
+ " (13, 'bbbb', 'TTSA', 10636871753394754362),\n",
+ " (14, 'bbbb', 'TSAA', 10636871753394754362),\n",
+ " (15, 'bbbe', 'SAAI', 5844270199162658202),\n",
+ " (17, 'befb', 'AIFA', 12608447144368280135),\n",
+ " (18, 'efbb', 'IFAA', 6737117628605503971),\n",
+ " (19, 'fbbc', 'FAAD', 15487073495104592163),\n",
+ " (21, 'bcbe', 'ADAL', 2582718285239245651),\n",
+ " (23, 'bedc', 'ALKE', 9924122599953948134),\n",
+ " (27, 'dcef', 'KDIF', 13066860520591833755),\n",
+ " (32, 'ecbf', 'INPW', 7824081998478575779),\n",
+ " (35, 'febb', 'WMPT', 22856495822854259),\n",
+ " (36, 'ebbf', 'MPTF', 3777468906224825823),\n",
+ " (39, 'fbfc', 'FGFE', 1717429840091522704),\n",
+ " (42, 'ccbb', 'ENTS', 7934574478782340536),\n",
+ " (43, 'cbbc', 'NTSE', 13425603466974449662),\n",
+ " (44, 'bbcf', 'TSEF', 401197479395011469),\n",
+ " (48, 'dece', 'RLDM', 3514259266920402271),\n",
+ " (52, 'ccee', 'DELV', 14823416379075295469),\n",
+ " (53, 'ceeb', 'ELVP', 4220580037854917126),\n",
+ " (54, 'eebb', 'LVPG', 11038159954814081015),\n",
+ " (55, 'ebbf', 'VPGF', 3777468906224825823),\n",
+ " (59, 'ccdb', 'ENKS', 9515149052504803760),\n",
+ " (60, 'cdbd', 'NKSK', 627200266029661451),\n",
+ " (64, 'ebed', 'ITIK', 10110588346040050718),\n",
+ " (65, 'bede', 'TIKL', 8778380644404774298),\n",
+ " (70, 'bfcb', 'PFEA', 18351374587090038902),\n",
+ " (72, 'cbcb', 'EANP', 12503683351264325607),\n",
+ " (75, 'bceb', 'PELG', 15690868526339966161),\n",
+ " (76, 'cebd', 'ELGK', 6011042865859074878),\n",
+ " (77, 'ebdc', 'LGKD', 18271496327093918616),\n",
+ " (84, 'bbfe', 'SAYI', 6002404200797057331),\n",
+ " (87, 'edec', 'IKVE', 2606773150647828518),\n",
+ " (92, 'ebed', 'LALK', 10110588346040050718),\n",
+ " (93, 'bedb', 'ALKA', 13946213461756747690),\n",
+ " (94, 'edbc', 'LKAE', 578033708912463536),\n",
+ " (95, 'dbcb', 'KAEG', 7885097756694372784),\n",
+ " (98, 'bddb', 'GKKG', 7218742382433715284),\n",
+ " (105, 'dece', 'KIDV', 3514259266920402271),\n",
+ " (106, 'eceb', 'IDVG', 9509177043812954048),\n",
+ " (107, 'cebc', 'DVGD', 10454519246483044435),\n",
+ " (108, 'ebce', 'VGDI', 4015064277273641161),\n",
+ " (109, 'bceb', 'GDIT', 15690868526339966161),\n",
+ " (111, 'ebbc', 'ITAQ', 5727016641903521905),\n",
+ " (112, 'bbce', 'TAQI', 4231916625216208266),\n",
+ " (113, 'bcec', 'AQIN', 18103179462109539146),\n",
+ " (124, 'ebbe', 'ISTM', 6920393162873654046),\n",
+ " (125, 'bbeb', 'STMT', 10595816954080188374),\n",
+ " (126, 'bebc', 'TMTD', 18364677344105950525),\n",
+ " (136, 'befb', 'SLFS', 12608447144368280135),\n",
+ " (137, 'efbf', 'LFSF', 15041436218241961998),\n",
+ " (141, 'bbeb', 'APMT', 10595816954080188374),\n",
+ " (142, 'bebb', 'PMTG', 12348475939575077495),\n",
+ " (143, 'ebbf', 'MTGF', 3777468906224825823),\n",
+ " (146, 'fdbb', 'FKST', 17601202462413201014),\n",
+ " (149, 'bffb', 'TYYG', 493115665276685844),\n",
+ " (153, 'fbbc', 'FPSN', 15487073495104592163),\n",
+ " (154, 'bbcc', 'PSND', 8851499734005910348),\n",
+ " (155, 'bccd', 'SNDR', 5203889812942446964),\n",
+ " (156, 'ccdb', 'NDRA', 9515149052504803760),\n",
+ " (157, 'cdbe', 'DRAV', 7606399555099723201),\n",
+ " (158, 'dbed', 'RAVR', 110875883647815593),\n",
+ " (159, 'bedb', 'AVRG', 13946213461756747690),\n",
+ " (160, 'edbb', 'VRGT', 10559932379304644073),\n",
+ " (161, 'dbbe', 'RGTI', 14964367747587985732),\n",
+ " (162, 'bbee', 'GTIL', 15322539958464395896),\n",
+ " (163, 'beeb', 'TILA', 17475210263719431112),\n",
+ " (165, 'ebdb', 'LARG', 12564816384096126964),\n",
+ " (166, 'bdbb', 'ARGT', 14842221203416155138),\n",
+ " (167, 'dbbb', 'RGTS', 17726373358396768861),\n",
+ " (168, 'bbbd', 'GTSK', 10427112140463483717),\n",
+ " (170, 'bdce', 'SKNI', 17335298721270894545),\n",
+ " (173, 'ebbe', 'IGTI', 6920393162873654046),\n",
+ " (174, 'bbec', 'GTIQ', 4064199507453608151),\n",
+ " (176, 'eceb', 'IQLG', 9509177043812954048),\n",
+ " (183, 'bdec', 'PKLD', 2259711356554022352),\n",
+ " (184, 'dece', 'KLDL', 3514259266920402271),\n",
+ " (185, 'eceb', 'LDLT', 9509177043812954048),\n",
+ " (190, 'bebb', 'AIGG', 12348475939575077495),\n",
+ " (191, 'ebbb', 'IGGT', 6489788892270137876),\n",
+ " (192, 'bbbb', 'GGTG', 10636871753394754362),\n",
+ " (193, 'bbbb', 'GTGT', 10636871753394754362),\n",
+ " (194, 'bbbb', 'TGTG', 10636871753394754362),\n",
+ " (195, 'bbbc', 'GTGN', 10936565803043742378),\n",
+ " (205, 'dcbb', 'KDTP', 11732771003953160801),\n",
+ " (209, 'fcdb', 'YNKT', 8777467295491174931),\n",
+ " (212, 'bfcb', 'TYQG', 18351374587090038902),\n",
+ " (214, 'cbee', 'QGIL', 3021297321754295892),\n",
+ " (215, 'beef', 'GILY', 207159318435450593),\n",
+ " (216, 'eefb', 'ILYG', 12562983450975136318),\n",
+ " (217, 'efbe', 'LYGI', 6323445226374325209),\n",
+ " (219, 'becb', 'GIQA', 16513065919568703664),\n",
+ " (220, 'ecbb', 'IQAT', 1139172134765890002),\n",
+ " (225, 'dbed', 'KPIK', 110875883647815593),\n",
+ " (226, 'bedc', 'PIKN', 9924122599953948134),\n",
+ " (230, 'eecc', 'LLDQ', 8067965132495951993),\n",
+ " (231, 'eccc', 'LDQN', 18222561503088214725),\n",
+ " (236, 'cbdb', 'DTKS', 7294566287391320156),\n",
+ " (237, 'bdbe', 'TKSV', 11859816197542080665),\n",
+ " (238, 'dbee', 'KSVI', 6222755038673932181),\n",
+ " (239, 'beeb', 'SVIA', 17475210263719431112),\n",
+ " (242, 'bcbb', 'AETP', 13143451112721385487),\n",
+ " (248, 'ecfb', 'LNFG', 4312308046274198431),\n",
+ " (249, 'cfbe', 'NFGL', 6246859079940908248),\n",
+ " (251, 'bebb', 'GLSG', 12348475939575077495),\n",
+ " (252, 'ebbb', 'LSGA', 6489788892270137876),\n",
+ " (257, 'bccb', 'GNET', 2481900653225094471),\n",
+ " (262, 'ccbb', 'NNSS', 7934574478782340536),\n",
+ " (263, 'cbbe', 'NSSI', 16279479833324286872),\n",
+ " (264, 'bbeb', 'SSIT', 10595816954080188374),\n",
+ " (268, 'fbed', 'YSLK', 9666214415774811527),\n",
+ " (269, 'bedc', 'SLKD', 9924122599953948134),\n",
+ " (272, 'cdbe', 'DKSV', 7606399555099723201),\n",
+ " (273, 'dbee', 'KSVV', 6222755038673932181),\n",
+ " (274, 'beeb', 'SVVG', 17475210263719431112),\n",
+ " (278, 'ccee', 'NDLL', 14823416379075295469),\n",
+ " (279, 'ceeb', 'DLLS', 4220580037854917126),\n",
+ " (280, 'eebb', 'LLSP', 11038159954814081015),\n",
+ " (281, 'ebbb', 'LSPT', 6489788892270137876),\n",
+ " (282, 'bbbe', 'SPTL', 5844270199162658202),\n",
+ " (283, 'bbeb', 'PTLS', 10595816954080188374),\n",
+ " (284, 'bebc', 'TLSN', 18364677344105950525),\n",
+ " (286, 'bcbb', 'SNSA', 13143451112721385487),\n",
+ " (287, 'cbbe', 'NSAI', 16279479833324286872),\n",
+ " (288, 'bbee', 'SAIL', 15322539958464395896),\n",
+ " (289, 'beeb', 'AILA', 17475210263719431112),\n",
+ " (290, 'eebb', 'ILAS', 11038159954814081015),\n",
+ " (291, 'ebbf', 'LASF', 3777468906224825823),\n",
+ " (297, 'dfde', 'KYKL', 6306512417307503239),\n",
+ " (300, 'ebeb', 'LGLT', 13416682157814255285),\n",
+ " (301, 'bebd', 'GLTK', 16657858857931383438),\n",
+ " (303, 'bdec', 'TKIN', 2259711356554022352),\n",
+ " (305, 'eccd', 'INDK', 2362080192152468741),\n",
+ " (310, 'bfee', 'TYLI', 10718531886035662971),\n",
+ " (312, 'eeec', 'LILQ', 3327879637431277366),\n",
+ " (313, 'eece', 'ILQM', 3038027948003595164),\n",
+ " (314, 'eceb', 'LQMG', 9509177043812954048),\n",
+ " (316, 'ebbc', 'MGTD', 5727016641903521905),\n",
+ " (317, 'bbcf', 'GTDF', 401197479395011469),\n",
+ " (318, 'bcfb', 'TDFG', 2564856895474494016),\n",
+ " (319, 'cfbe', 'DFGI', 6246859079940908248),\n",
+ " (321, 'becb', 'GIDP', 16513065919568703664),\n",
+ " (322, 'ecbf', 'IDPF', 7824081998478575779),\n",
+ " (325, 'fbbc', 'FASD', 15487073495104592163),\n",
+ " (326, 'bbcf', 'ASDF', 401197479395011469),\n",
+ " (327, 'bcfb', 'SDFS', 2564856895474494016),\n",
+ " (328, 'cfbe', 'DFSI', 6246859079940908248),\n",
+ " (329, 'fbef', 'FSIF', 15409343862723143238),\n",
+ " (330, 'befb', 'SIFG', 12608447144368280135),\n",
+ " (335, 'ebdb', 'ISKA', 12564816384096126964),\n",
+ " (336, 'bdbb', 'SKAA', 14842221203416155138),\n",
+ " (337, 'dbbc', 'KAAN', 5358982605686529698),\n",
+ " (338, 'bbcf', 'AANF', 401197479395011469),\n",
+ " (343, 'dcbb', 'KETP', 11732771003953160801),\n",
+ " (344, 'cbbb', 'ETPS', 7151776787376566250),\n",
+ " (345, 'bbbc', 'TPSD', 10936565803043742378),\n",
+ " (353, 'bcef', 'AEIF', 3701223976780487860),\n",
+ " (357, 'cbcb', 'DPNG', 12503683351264325607),\n",
+ " (359, 'cbcb', 'NGNA', 12503683351264325607),\n",
+ " (360, 'bcbe', 'GNAL', 2582718285239245651),\n",
+ " (361, 'cbec', 'NALN', 12306494840529946604),\n",
+ " (362, 'becf', 'ALNF', 11450466598554388674),\n",
+ " (363, 'ecfb', 'LNFS', 4312308046274198431),\n",
+ " (366, 'bdcb', 'SKNT', 15142247617731513045),\n",
+ " (367, 'dcbc', 'KNTE', 13393529472101385437),\n",
+ " (369, 'bceb', 'TELG', 15690868526339966161),\n",
+ " (371, 'ebeb', 'LGIA', 13416682157814255285),\n",
+ " (375, 'fbbb', 'FSTG', 12785923314063975223),\n",
+ " (376, 'bbbb', 'STGA', 10636871753394754362),\n",
+ " (377, 'bbbb', 'TGAS', 10636871753394754362),\n",
+ " (378, 'bbbe', 'GASI', 5844270199162658202),\n",
+ " (379, 'bbeb', 'ASIG', 10595816954080188374),\n",
+ " (383, 'fbfc', 'FAWN', 1717429840091522704),\n",
+ " (387, 'dcbb', 'KDTG', 11732771003953160801),\n",
+ " (388, 'cbbc', 'DTGE', 13425603466974449662),\n",
+ " (395, 'fbed', 'WAIK', 9666214415774811527),\n",
+ " (396, 'bedb', 'AIKG', 13946213461756747690),\n",
+ " (397, 'edbb', 'IKGS', 10559932379304644073),\n",
+ " (398, 'dbbc', 'KGSD', 5358982605686529698),\n",
+ " (404, 'bbde', 'STRL', 13444773003568638791),\n",
+ " (409, 'bccc', 'GEQD', 1677271796314223088),\n",
+ " (414, 'dbbe', 'KSGV', 14964367747587985732),\n",
+ " (415, 'bbeb', 'SGVA', 10595816954080188374),\n",
+ " (416, 'bebe', 'GVAL', 9185312005446482276),\n",
+ " (417, 'ebeb', 'VALG', 13416682157814255285),\n",
+ " (418, 'bebe', 'ALGI', 9185312005446482276),\n",
+ " (419, 'ebeb', 'LGIS', 13416682157814255285),\n",
+ " (428, 'fdbd', 'YRSK', 7815796655124299643),\n",
+ " (429, 'dbdc', 'RSKD', 6008455418557613888),\n",
+ " (430, 'bdcb', 'SKDT', 15142247617731513045),\n",
+ " (431, 'dcbc', 'KDTE', 13393529472101385437),\n",
+ " (437, 'edbe', 'LKTI', 3379849564839441876),\n",
+ " (438, 'dbeb', 'KTIS', 2245165869925555499),\n",
+ " (439, 'bebc', 'TISE', 18364677344105950525),\n",
+ " (441, 'bccb', 'SENA', 2481900653225094471),\n",
+ " (444, 'bfcb', 'AFQS', 18351374587090038902),\n",
+ " (446, 'cbec', 'QSLN', 12306494840529946604),\n",
+ " (450, 'eceb', 'VEIS', 9509177043812954048),\n",
+ " (452, 'ebbf', 'ISSY', 3777468906224825823),\n",
+ " (459, 'ddbe', 'KKGI', 17379983601788100176),\n",
+ " (460, 'dbee', 'KGII', 6222755038673932181),\n",
+ " (461, 'beec', 'GIIN', 8654096532145016621),\n",
+ " (464, 'cbeb', 'NGLG', 17993562543865718127),\n",
+ " (466, 'ebfe', 'LGWI', 14933920156901041087),\n",
+ " (468, 'febb', 'WITS', 22856495822854259),\n",
+ " (469, 'ebbe', 'ITSI', 6920393162873654046),\n",
+ " (470, 'bbeb', 'TSIG', 10595816954080188374),\n",
+ " (471, 'bebe', 'SIGL', 9185312005446482276),\n",
+ " (476, 'ceed', 'DILR', 14964024876258418521),\n",
+ " (480, 'cdbe', 'QKSV', 7606399555099723201),\n",
+ " (481, 'dbec', 'KSVE', 15521531363153442454),\n",
+ " (482, 'becc', 'SVEN', 16775746449511419550),\n",
+ " (485, 'cfbb', 'NYPT', 10413764088404445244),\n",
+ " (486, 'fbbb', 'YPTT', 12785923314063975223),\n",
+ " (487, 'bbbe', 'PTTI', 5844270199162658202),\n",
+ " (488, 'bbeb', 'TTIS', 10595816954080188374),\n",
+ " (489, 'bebb', 'TISS', 12348475939575077495),\n",
+ " (490, 'ebbb', 'ISST', 6489788892270137876),\n",
+ " (491, 'bbbb', 'SSTT', 10636871753394754362),\n",
+ " (492, 'bbbc', 'STTE', 10936565803043742378),\n",
+ " (493, 'bbcc', 'TTEN', 8851499734005910348),\n",
+ " (494, 'bccc', 'TENN', 1677271796314223088),\n",
+ " (498, 'cbcc', 'QTEQ', 9144737173124411667),\n",
+ " (499, 'bccb', 'TEQS', 2481900653225094471),\n",
+ " (500, 'ccbb', 'EQSS', 7934574478782340536),\n",
+ " (501, 'cbbb', 'QSST', 7151776787376566250),\n",
+ " (502, 'bbbb', 'SSTS', 10636871753394754362),\n",
+ " (503, 'bbbb', 'STST', 10636871753394754362),\n",
+ " (504, 'bbbd', 'TSTK', 10427112140463483717),\n",
+ " (506, 'bdbb', 'TKTT', 14842221203416155138),\n",
+ " (507, 'dbbb', 'KTTT', 17726373358396768861),\n",
+ " (508, 'bbbb', 'TTTP', 10636871753394754362),\n",
+ " (509, 'bbbc', 'TTPN', 10936565803043742378),\n",
+ " (510, 'bbce', 'TPNL', 4231916625216208266),\n",
+ " (511, 'bceb', 'PNLT', 15690868526339966161),\n",
+ " (516, 'ccbe', 'EDAM', 17994569987617823751),\n",
+ " (518, 'bede', 'AMKL', 8778380644404774298),\n",
+ " (521, 'ebeb', 'LGLA', 13416682157814255285),\n",
+ " (522, 'bebe', 'GLAL', 9185312005446482276),\n",
+ " (527, 'ecfb', 'LDYA', 4312308046274198431),\n",
+ " (528, 'cfbe', 'DYAI', 6246859079940908248),\n",
+ " (530, 'bebe', 'AIPI', 9185312005446482276),\n",
+ " (531, 'ebeb', 'IPIA', 13416682157814255285),\n",
+ " (532, 'bebb', 'PIAS', 12348475939575077495),\n",
+ " (533, 'ebbe', 'IASI', 6920393162873654046),\n",
+ " (534, 'bbeb', 'ASIS', 10595816954080188374),\n",
+ " (535, 'bebb', 'SIST', 12348475939575077495),\n",
+ " (536, 'ebbc', 'ISTE', 5727016641903521905),\n",
+ " (540, 'bfee', 'AYVV', 10718531886035662971),\n",
+ " (541, 'feeb', 'YVVP', 3909355865613333976),\n",
+ " (542, 'eebf', 'VVPY', 4195867088387656031),\n",
+ " (543, 'ebfe', 'VPYI', 14933920156901041087),\n",
+ " (545, 'febb', 'YIGA', 22856495822854259),\n",
+ " (546, 'ebbf', 'IGAY', 3777468906224825823),\n",
+ " (547, 'bbfe', 'GAYI', 6002404200797057331),\n",
+ " (548, 'bfee', 'AYIL', 10718531886035662971),\n",
+ " (549, 'feeb', 'YILG', 3909355865613333976),\n",
+ " (550, 'eebb', 'ILGP', 11038159954814081015),\n",
+ " (551, 'ebbb', 'LGPS', 6489788892270137876),\n",
+ " (552, 'bbbc', 'GPSN', 10936565803043742378),\n",
+ " (557, 'ebbc', 'LSSD', 5727016641903521905),\n",
+ " (559, 'bcbb', 'SDAT', 13143451112721385487),\n",
+ " (560, 'cbbd', 'DATK', 15963831736868780672),\n",
+ " (561, 'bbde', 'ATKI', 13444773003568638791),\n",
+ " (564, 'efed', 'IYLK', 3007946504019006955),\n",
+ " (566, 'edbb', 'LKTG', 10559932379304644073),\n",
+ " (567, 'dbbe', 'KTGL', 14964367747587985732),\n",
+ " (568, 'bbeb', 'TGLS', 10595816954080188374),\n",
+ " (569, 'bebe', 'GLSL', 9185312005446482276),\n",
+ " (571, 'becd', 'SLEK', 6470706253879962510),\n",
+ " (575, 'eedf', 'LIRF', 974291162461474114),\n",
+ " (579, 'bbeb', 'TTIS', 10595816954080188374),\n",
+ " (580, 'bebe', 'TISL', 9185312005446482276),\n",
+ " (581, 'ebeb', 'ISLG', 13416682157814255285),\n",
+ " (584, 'bfcb', 'GWDS', 18351374587090038902),\n",
+ " (586, 'cbcc', 'DSNN', 9144737173124411667),\n",
+ " (588, 'ccee', 'NNII', 14823416379075295469),\n",
+ " (590, 'eece', 'IIEL', 3038027948003595164),\n",
+ " (591, 'eceb', 'IELA', 9509177043812954048),\n",
+ " (592, 'cebc', 'ELAN', 10454519246483044435),\n",
+ " (596, 'dcbc', 'KNTN', 13393529472101385437),\n",
+ " (597, 'cbcc', 'NTNN', 9144737173124411667),\n",
+ " (598, 'bccb', 'TNNA', 2481900653225094471),\n",
+ " (599, 'ccbb', 'NNAA', 7934574478782340536),\n",
+ " (600, 'cbbe', 'NAAI', 16279479833324286872),\n",
+ " (601, 'bbeb', 'AAIG', 10595816954080188374),\n",
+ " (602, 'bebb', 'AIGS', 12348475939575077495),\n",
+ " (603, 'ebbb', 'IGSA', 6489788892270137876),\n",
+ " (605, 'bbfe', 'SAFL', 6002404200797057331),\n",
+ " (614, 'fbbb', 'YSGS', 12785923314063975223)]"
+ ]
+ },
+ "execution_count": 172,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Step through SigSeq.get_overlapping_kmers by hand, mirroring the\n",
+ "# `self`/`other` names used inside the method.\n",
+ "self = cd47_sigseq\n",
+ "other = p66_sigseq\n",
+ "overlap = self.get_overlapping_kmers(other)\n",
+ "\n",
+ "# Each entry is (index, dayhoff k-mer, protein k-mer, hash value). Show the\n",
+ "# size and a short preview rather than dumping every tuple into the output.\n",
+ "print(len(overlap))\n",
+ "overlap[:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1d01285",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "id": "27ae655c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ksize: 3\n",
+ "ksize: 4\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sigseq import SigSeq\n",
+ "\n",
+ "# Rebuild the SigSeq objects without displaying anything. After the loop,\n",
+ "# `p66_sigseq` and `cd47_sigseq` hold the objects for the last ksize in\n",
+ "# `p66_cd47_sigs`; the cells below inspect those.\n",
+ "for ksize, sigs in p66_cd47_sigs.items():\n",
+ "    print(f\"ksize: {ksize}\")\n",
+ "    p66_sigseq = SigSeq(sigs[\"p66\"], H7C7N8_BORBU)\n",
+ "    cd47_sigseq = SigSeq(sigs[\"cd47\"], CD47_HUMAN)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 174,
+ "id": "adfc75d4",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "Non-sequential indices -- Previous: 0, current: 2",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[174], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mp66_sigseq\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdisplay_alignment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcd47_sigseq\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:244\u001b[0m, in \u001b[0;36mSigSeq.display_alignment\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[38;5;124;03m\"\"\"Displays the alignment between two sequences\"\"\"\u001b[39;00m\n\u001b[0;32m 243\u001b[0m \u001b[38;5;66;03m# Compute overlaps\u001b[39;00m\n\u001b[1;32m--> 244\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_overlap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 245\u001b[0m other\u001b[38;5;241m.\u001b[39mcompute_overlap(\u001b[38;5;28mself\u001b[39m)\n\u001b[0;32m 247\u001b[0m \u001b[38;5;66;03m# Verify overlaps\u001b[39;00m\n",
+ "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:190\u001b[0m, in \u001b[0;36mSigSeq.compute_overlap\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;124;03m\"\"\"Computes overlap information between two sequences\"\"\"\u001b[39;00m\n\u001b[0;32m 189\u001b[0m overlap \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_overlapping_kmers(other)\n\u001b[1;32m--> 190\u001b[0m overlap_seq \u001b[38;5;241m=\u001b[39m \u001b[43mKmerStitcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstitch_kmers\u001b[49m\u001b[43m(\u001b[49m\u001b[43moverlap\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_encoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 191\u001b[0m overlap_encoded \u001b[38;5;241m=\u001b[39m KmerStitcher\u001b[38;5;241m.\u001b[39mstitch_kmers(overlap, use_encoded\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 193\u001b[0m index \u001b[38;5;241m=\u001b[39m other\u001b[38;5;241m.\u001b[39mseq\u001b[38;5;241m.\u001b[39mindex(overlap_seq)\n",
+ "File \u001b[1;32m~\\2024-kmerseek-analysis\\notebooks\\sigseq.py:95\u001b[0m, in \u001b[0;36mKmerStitcher.stitch_kmers\u001b[1;34m(cls, overlap, use_encoded)\u001b[0m\n\u001b[0;32m 93\u001b[0m prev_i \u001b[38;5;241m=\u001b[39m i\n\u001b[0;32m 94\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 96\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNon-sequential indices -- Previous: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mprev_i\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, current: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 99\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m stitched\n",
+ "\u001b[1;31mValueError\u001b[0m: Non-sequential indices -- Previous: 0, current: 2"
+ ]
+ }
+ ],
+ "source": [
+ "# display_alignment raises ValueError when the overlapping k-mer indices are\n",
+ "# not consecutive (KmerStitcher.stitch_kmers cannot stitch them). Catch and\n",
+ "# show the message so the notebook still runs top to bottom; the cells below\n",
+ "# dig into why the overlap is non-sequential.\n",
+ "try:\n",
+ "    p66_sigseq.display_alignment(cd47_sigseq)\n",
+ "except ValueError as e:\n",
+ "    print(e)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 175,
+ "id": "32b99e31-5344-4529-acb6-8d8f1392db95",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:44:13.803198Z",
+ "iopub.status.busy": "2024-11-12T21:44:13.802750Z",
+ "iopub.status.idle": "2024-11-12T21:44:13.805865Z",
+ "shell.execute_reply": "2024-11-12T21:44:13.805585Z",
+ "shell.execute_reply.started": "2024-11-12T21:44:13.803185Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'edbdeefdeeefebbbbbefbbcbedcdcefdecbfebbfbfccbbcfdececceebbfccdbdebededbfcbcbcebdccbfbbfedeccebedbcbddbccfdecebcebbcecefcffedebbebcfcfcdcbefbfbbebbfdbbffbfbbccdbedbbeebdbbbdcebbecebfdebdecebfbebbbbbbcdcccccdcbbfcdbfcbeefbecbbfdbedceecccccbdbeebcbbfcecfbebbbfbccbfccbbebfbedcdbeebcceebbbebcbbeebbfbbdfdebebdeccdcbfeeecebbcfbecbfbbcfbefbdebdbbcfddcbbbcbcddbcefcbcbcbecfbdcbcebebfbbbbbebfbfcdcbbcdcbfbedbbcbfbbdefbcccddbbebebebfbccefdbdcbcddedbebccbfcbececebbfcccddbeecbebfebbebefceedcdbeccfbbbebbbbccccbccbbbbbdbbbbcebfccbedebebefecfbebebbebbcbfeebfebbfeebbbcdebbcbbdefedbbebecdeedfbbebebfcbcceecebcdcbccbbebbbfecfdebfbbb'"
+ ]
+ },
+ "execution_count": 175,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p66_sigseq.seq_encoded"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 176,
+ "id": "69305af8-39ea-49b1-a307-b49e8d8d31a7",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:49:30.134639Z",
+ "iopub.status.busy": "2024-11-12T21:49:30.134314Z",
+ "iopub.status.idle": "2024-11-12T21:49:30.137414Z",
+ "shell.execute_reply": "2024-11-12T21:49:30.137124Z",
+ "shell.execute_reply.started": "2024-11-12T21:49:30.134625Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'MKSHILYKLIIFLTTSAAIFAADALKEKDIFKINPWMPTFGFENTSEFRLDMDELVPGFENKSKITIKLKPFEANPELGKDDPFSAYIKVEDLALKAEGKKGDQFKIDVGDITAQINMYDFFIKISTMTDFDFNKESLFSFAPMTGFKSTYYGFPSNDRAVRGTILARGTSKNIGTIQLGYKLPKLDLTFAIGGTGTGNRNQENDKDTPYNKTYQGILYGIQATWKPIKNLLDQNEDTKSVIAETPFELNFGLSGAYGNETFNNSSITYSLKDKSVVGNDLLSPTLSNSAILASFGAKYKLGLTKINDKNTYLILQMGTDFGIDPFASDFSIFGHISKAANFKKETPSDPNKKAEIFDPNGNALNFSKNTELGIAFSTGASIGFAWNKDTGEKESWAIKGSDSYSTRLFGEQDKKSGVALGISYGQNLYRSKDTEKRLKTISENAFQSLNVEISSYEDNKKGIINGLGWITSIGLYDILRQKSVENYPTTISSTTENNQTEQSSTSTKTTTPNLTFEDAMKLGLALYLDYAIPIASISTEAYVVPYIGAYILGPSNKLSSDATKIYLKTGLSLEKLIRFTTISLGWDSNNIIELANKNTNNAAIGSAFLQFKIAYSGS'"
+ ]
+ },
+ "execution_count": 176,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p66_sigseq.seq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 177,
+ "id": "645c4216-7ee5-4962-b06c-b37766854596",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:47:28.023690Z",
+ "iopub.status.busy": "2024-11-12T21:47:28.023361Z",
+ "iopub.status.idle": "2024-11-12T21:47:28.026602Z",
+ "shell.execute_reply": "2024-11-12T21:47:28.026319Z",
+ "shell.execute_reply.started": "2024-11-12T21:47:28.023677Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4"
+ ]
+ },
+ "execution_count": 177,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p66_sigseq.sig.minhash.ksize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 178,
+ "id": "63267a8a-09c4-4678-8e62-ab32e8e73d54",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:47:29.130940Z",
+ "iopub.status.busy": "2024-11-12T21:47:29.130609Z",
+ "iopub.status.idle": "2024-11-12T21:47:29.133676Z",
+ "shell.execute_reply": "2024-11-12T21:47:29.133391Z",
+ "shell.execute_reply.started": "2024-11-12T21:47:29.130926Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'efbeebbeeebbbaabbbceefcdbdbecfbfaccbeeebafebcecbccbbcefedfdfdbdcefbfcbbecdbbebbcfbbbdecebceedbcbbedecdbcbebdbbcfbacebcebdcbcbeecedfdeebffbbccceeeeefbefbeeeffbcfbedbedfdbbbeccdbebeeebbeeebeeeeebbeefebbcfbedcbbbebeeebbbbeeeeedffefbbbebebbfeebeeeeceebfeebeebebeaebbaebedbbeeebbebeebebceebeefedfebbccdbecbbddbeccbecbfdcbdbeeccc'"
+ ]
+ },
+ "execution_count": 178,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cd47_sigseq.seq_encoded"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 179,
+ "id": "6231db67-2862-45b2-b4eb-a29dbb6ad13e",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:49:34.422710Z",
+ "iopub.status.busy": "2024-11-12T21:49:34.422392Z",
+ "iopub.status.idle": "2024-11-12T21:49:34.425372Z",
+ "shell.execute_reply": "2024-11-12T21:49:34.425098Z",
+ "shell.execute_reply.started": "2024-11-12T21:49:34.422696Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'MWPLVAALLLGSACCGSAQLLFNKTKSVEFTFCNDTVVIPCFVTNMEAQNTTEVYVKWKFKGRDIYTFDGALNKSTVPTDFSSAKIEVSQLLKGDASLKMDKSDAVSHTGNYTCEVTELTREGETIIELKYRVVSWFSPNENILIVIFPIFAILLFWGQFGIKTLKYRSGGMDEKTIALLVAGLVITVIVIVGAILFVPGEYSLKNATGLGLIVTSTGILILLHYYVFSTAIGLTSFVIAILVIQVIAYILAVVGLSLCIAACIPMHGPLLISGLSILALAQLLGLVYMKFVASNQKTIQPPRKAVEEPLNAFKESKGMMNDE'"
+ ]
+ },
+ "execution_count": 179,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cd47_sigseq.seq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 180,
+ "id": "9cc02e40-6770-4a2c-9305-1a5553982b03",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:48:18.872968Z",
+ "iopub.status.busy": "2024-11-12T21:48:18.872639Z",
+ "iopub.status.idle": "2024-11-12T21:48:18.875221Z",
+ "shell.execute_reply": "2024-11-12T21:48:18.874842Z",
+ "shell.execute_reply.started": "2024-11-12T21:48:18.872954Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "columns = [\"i\", \"dayhoff\", \"protein\", \"hashval\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 181,
+ "id": "a59d434e-77b3-47c6-ab2a-0224dd0fbfb6",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:48:19.021119Z",
+ "iopub.status.busy": "2024-11-12T21:48:19.020791Z",
+ "iopub.status.idle": "2024-11-12T21:48:19.042976Z",
+ "shell.execute_reply": "2024-11-12T21:48:19.042669Z",
+ "shell.execute_reply.started": "2024-11-12T21:48:19.021105Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(295, 4)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " i | \n",
+ " dayhoff | \n",
+ " protein | \n",
+ " hashval | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6 | \n",
+ " fdee | \n",
+ " YKLI | \n",
+ " 10068566436590771491 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 8 | \n",
+ " eeef | \n",
+ " LIIF | \n",
+ " 224116582751336810 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 9 | \n",
+ " eefe | \n",
+ " IIFL | \n",
+ " 7860449462917742889 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10 | \n",
+ " efeb | \n",
+ " IFLT | \n",
+ " 4466143379625637480 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11 | \n",
+ " febb | \n",
+ " FLTT | \n",
+ " 22856495822854259 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " i dayhoff protein hashval\n",
+ "0 6 fdee YKLI 10068566436590771491\n",
+ "1 8 eeef LIIF 224116582751336810\n",
+ "2 9 eefe IIFL 7860449462917742889\n",
+ "3 10 efeb IFLT 4466143379625637480\n",
+ "4 11 febb FLTT 22856495822854259"
+ ]
+ },
+ "execution_count": 181,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Overlapping k-mers reported by the CD47 SigSeq against p66. The `i` and\n",
+ "# `protein` columns refer to positions in the p66 sequence (row 0 is `YKLI`\n",
+ "# at index 6 of `p66_sigseq.seq`).\n",
+ "p66_in_cd47_overlap = pd.DataFrame(\n",
+ "    data=cd47_sigseq.get_overlapping_kmers(p66_sigseq),\n",
+ "    columns=columns,\n",
+ ")\n",
+ "print(p66_in_cd47_overlap.shape)\n",
+ "p66_in_cd47_overlap.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 182,
+ "id": "491e094e-ddfa-40ea-bc95-8400bf59c122",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:48:39.916599Z",
+ "iopub.status.busy": "2024-11-12T21:48:39.916278Z",
+ "iopub.status.idle": "2024-11-12T21:48:39.920665Z",
+ "shell.execute_reply": "2024-11-12T21:48:39.920373Z",
+ "shell.execute_reply.started": "2024-11-12T21:48:39.916585Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dayhoff\n",
+ "bbeb 12\n",
+ "bbbb 11\n",
+ "ebeb 7\n",
+ "bebb 7\n",
+ "bebe 7\n",
+ " ..\n",
+ "eccc 1\n",
+ "cbdb 1\n",
+ "bdbe 1\n",
+ "dfde 1\n",
+ "eedf 1\n",
+ "Name: count, Length: 127, dtype: int64"
+ ]
+ },
+ "execution_count": 182,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p66_in_cd47_overlap.dayhoff.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 183,
+ "id": "4fe2d889-119c-48ca-bd00-542f9138c833",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:48:19.197833Z",
+ "iopub.status.busy": "2024-11-12T21:48:19.197509Z",
+ "iopub.status.idle": "2024-11-12T21:48:19.209901Z",
+ "shell.execute_reply": "2024-11-12T21:48:19.209603Z",
+ "shell.execute_reply.started": "2024-11-12T21:48:19.197819Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(199, 4)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " i | \n",
+ " dayhoff | \n",
+ " protein | \n",
+ " hashval | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " efbe | \n",
+ " MWPL | \n",
+ " 6323445226374325209 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " beeb | \n",
+ " PLVA | \n",
+ " 17475210263719431112 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " eebb | \n",
+ " LVAA | \n",
+ " 11038159954814081015 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " ebbe | \n",
+ " VAAL | \n",
+ " 6920393162873654046 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " bbee | \n",
+ " AALL | \n",
+ " 15322539958464395896 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " i dayhoff protein hashval\n",
+ "0 0 efbe MWPL 6323445226374325209\n",
+ "1 2 beeb PLVA 17475210263719431112\n",
+ "2 3 eebb LVAA 11038159954814081015\n",
+ "3 4 ebbe VAAL 6920393162873654046\n",
+ "4 5 bbee AALL 15322539958464395896"
+ ]
+ },
+ "execution_count": 183,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Overlapping k-mers reported by the p66 SigSeq against CD47. The `i` and\n",
+ "# `protein` columns refer to positions in the CD47 sequence (row 0 is `MWPL`\n",
+ "# at index 0 of `cd47_sigseq.seq`).\n",
+ "cd47_in_p66_overlap = pd.DataFrame(\n",
+ "    data=p66_sigseq.get_overlapping_kmers(cd47_sigseq),\n",
+ "    columns=columns,\n",
+ ")\n",
+ "print(cd47_in_p66_overlap.shape)\n",
+ "cd47_in_p66_overlap.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 184,
+ "id": "0b40f3d1-28b7-436f-882d-66d8312ac6d8",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:48:33.116250Z",
+ "iopub.status.busy": "2024-11-12T21:48:33.115922Z",
+ "iopub.status.idle": "2024-11-12T21:48:33.120505Z",
+ "shell.execute_reply": "2024-11-12T21:48:33.120208Z",
+ "shell.execute_reply.started": "2024-11-12T21:48:33.116237Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dayhoff\n",
+ "eebb 6\n",
+ "bbee 5\n",
+ "bebe 5\n",
+ "bcfb 4\n",
+ "ebce 4\n",
+ " ..\n",
+ "bede 1\n",
+ "bcbb 1\n",
+ "edbc 1\n",
+ "ceed 1\n",
+ "eccc 1\n",
+ "Name: count, Length: 127, dtype: int64"
+ ]
+ },
+ "execution_count": 184,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cd47_in_p66_overlap.dayhoff.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 185,
+ "id": "9d888649-eb01-452d-ba52-825a119cdc3b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T21:52:02.519716Z",
+ "iopub.status.busy": "2024-11-12T21:52:02.519502Z",
+ "iopub.status.idle": "2024-11-12T21:52:02.526996Z",
+ "shell.execute_reply": "2024-11-12T21:52:02.526710Z",
+ "shell.execute_reply.started": "2024-11-12T21:52:02.519703Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " i_x | \n",
+ " dayhoff | \n",
+ " protein_x | \n",
+ " hashval | \n",
+ " i_y | \n",
+ " protein_y | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6 | \n",
+ " fdee | \n",
+ " YKLI | \n",
+ " 10068566436590771491 | \n",
+ " 130 | \n",
+ " YRVV | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 8 | \n",
+ " eeef | \n",
+ " LIIF | \n",
+ " 224116582751336810 | \n",
+ " 144 | \n",
+ " IVIF | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 8 | \n",
+ " eeef | \n",
+ " LIIF | \n",
+ " 224116582751336810 | \n",
+ " 152 | \n",
+ " ILLF | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9 | \n",
+ " eefe | \n",
+ " IIFL | \n",
+ " 7860449462917742889 | \n",
+ " 194 | \n",
+ " ILFV | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 9 | \n",
+ " eefe | \n",
+ " IIFL | \n",
+ " 7860449462917742889 | \n",
+ " 285 | \n",
+ " LVYM | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 529 | \n",
+ " 603 | \n",
+ " ebbb | \n",
+ " IGSA | \n",
+ " 6489788892270137876 | \n",
+ " 9 | \n",
+ " LGSA | \n",
+ "
\n",
+ " \n",
+ " 530 | \n",
+ " 603 | \n",
+ " ebbb | \n",
+ " IGSA | \n",
+ " 6489788892270137876 | \n",
+ " 213 | \n",
+ " VTST | \n",
+ "
\n",
+ " \n",
+ " 531 | \n",
+ " 605 | \n",
+ " bbfe | \n",
+ " SAFL | \n",
+ " 6002404200797057331 | \n",
+ " 234 | \n",
+ " TSFV | \n",
+ "
\n",
+ " \n",
+ " 532 | \n",
+ " 614 | \n",
+ " fbbb | \n",
+ " YSGS | \n",
+ " 12785923314063975223 | \n",
+ " 80 | \n",
+ " FSSA | \n",
+ "
\n",
+ " \n",
+ " 533 | \n",
+ " 614 | \n",
+ " fbbb | \n",
+ " YSGS | \n",
+ " 12785923314063975223 | \n",
+ " 227 | \n",
+ " FSTA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
534 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " i_x dayhoff protein_x hashval i_y protein_y\n",
+ "0 6 fdee YKLI 10068566436590771491 130 YRVV\n",
+ "1 8 eeef LIIF 224116582751336810 144 IVIF\n",
+ "2 8 eeef LIIF 224116582751336810 152 ILLF\n",
+ "3 9 eefe IIFL 7860449462917742889 194 ILFV\n",
+ "4 9 eefe IIFL 7860449462917742889 285 LVYM\n",
+ ".. ... ... ... ... ... ...\n",
+ "529 603 ebbb IGSA 6489788892270137876 9 LGSA\n",
+ "530 603 ebbb IGSA 6489788892270137876 213 VTST\n",
+ "531 605 bbfe SAFL 6002404200797057331 234 TSFV\n",
+ "532 614 fbbb YSGS 12785923314063975223 80 FSSA\n",
+ "533 614 fbbb YSGS 12785923314063975223 227 FSTA\n",
+ "\n",
+ "[534 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 185,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p66_in_cd47_overlap.merge(cd47_in_p66_overlap, on=[\"dayhoff\", \"hashval\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c6f2af40-562a-4bf6-98b1-b1c8f9e83105",
+ "metadata": {},
+ "source": [
+ "### Where is jaccard > 0.001?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c03d1328",
+ "metadata": {},
+ "source": [
+ "**Note:** the jaccard threshold in the query below was dropped (lowered) from the `jaccard > 0.7` used in the hp dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 191,
+ "id": "2af67f6f-51f5-4ed1-97ac-b0bf4f03a76d",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T19:00:34.689250Z",
+ "iopub.status.busy": "2024-11-12T19:00:34.689105Z",
+ "iopub.status.idle": "2024-11-12T19:00:34.754032Z",
+ "shell.execute_reply": "2024-11-12T19:00:34.753551Z",
+ "shell.execute_reply.started": "2024-11-12T19:00:34.689238Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " query | \n",
+ " match | \n",
+ " moltype | \n",
+ " ksize | \n",
+ " jaccard | \n",
+ " query_n_kmers | \n",
+ " query_n_unique_kmers | \n",
+ " query_intersection_positions | \n",
+ " match_n_kmers | \n",
+ " match_n_unique_kmers | \n",
+ " match_intersection_positions | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 5 | \n",
+ " 0.098266 | \n",
+ " 319 | \n",
+ " 269 | \n",
+ " [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... | \n",
+ " 614 | \n",
+ " 491 | \n",
+ " [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 6 | \n",
+ " 0.026838 | \n",
+ " 318 | \n",
+ " 294 | \n",
+ " [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... | \n",
+ " 613 | \n",
+ " 586 | \n",
+ " [[9, eefebb], [75, bcebdc], [105, decebc], [10... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CD47 | \n",
+ " p66 | \n",
+ " dayhoff | \n",
+ " 7 | \n",
+ " 0.006579 | \n",
+ " 317 | \n",
+ " 306 | \n",
+ " [[84, decebce], [105, ebdbbcf], [197, ebbcfbe]... | \n",
+ " 612 | \n",
+ " 612 | \n",
+ " [[105, decebce], [154, bbccdbe], [159, bedbbee... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " query match moltype ksize jaccard query_n_kmers query_n_unique_kmers \\\n",
+ "0 CD47 p66 dayhoff 5 0.098266 319 269 \n",
+ "1 CD47 p66 dayhoff 6 0.026838 318 294 \n",
+ "2 CD47 p66 dayhoff 7 0.006579 317 306 \n",
+ "\n",
+ " query_intersection_positions match_n_kmers \\\n",
+ "0 [[2, beebb], [8, eebbb], [15, bbbce], [26, bec... 614 \n",
+ "1 [[46, cbccbb], [74, bbebbc], [76, ebbcfb], [83... 613 \n",
+ "2 [[84, decebce], [105, ebdbbcf], [197, ebbcfbe]... 612 \n",
+ "\n",
+ " match_n_unique_kmers match_intersection_positions \n",
+ "0 491 [[9, eefeb], [10, efebb], [12, ebbbb], [14, bb... \n",
+ "1 586 [[9, eefebb], [75, bcebdc], [105, decebc], [10... \n",
+ "2 612 [[105, decebce], [154, bbccdbe], [159, bedbbee... "
+ ]
+ },
+ "execution_count": 191,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p66_cd47_df.query(\"jaccard > 0.001\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 192,
+ "id": "894951f6-8e71-4f7d-8ac6-6e75b3472427",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T19:00:34.754744Z",
+ "iopub.status.busy": "2024-11-12T19:00:34.754593Z",
+ "iopub.status.idle": "2024-11-12T19:00:34.862007Z",
+ "shell.execute_reply": "2024-11-12T19:00:34.861660Z",
+ "shell.execute_reply.started": "2024-11-12T19:00:34.754732Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 192,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAANAAAADQCAYAAAB2pO90AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPXUlEQVR4nO3dfZBddX3H8fdnQ2BpEiCkSYAiDRkQJAMNdQVqQREq3XYqVYxAKSP2YYAZmNCJnYq1AxFrp4+hFGhHpkDjA9XYWk1HBGlKxWqKWSBCl6iEGGhsyBOBPNiFTfbbP85v15vlZnP2/vbs3Xv385q5s3vPw93vmdnP3HPP/Z3fVxGBmTWmo9kFmLUyB8gsgwNklsEBMsvgAJllaJsAdXd3B+CHH1U96mqbAG3fvr3ZJdgkVGmAJHVL+r6k9ZJurrP+HZKelLRP0qJh666R9Fx6XFNlnWaNqixAkqYAdwO/ApwB/IakM4Zt9iLwIeCBYfseC9wKnAucA9wqaWZVtZo1qsp3oHOA9RGxISJeBz4P/HrtBhGxMSKeBgaG7fvLwCMR8XJE7AQeAborrNWsIVUG6GeA/6l5viktG7N9JV0rqUdSz7Zt297wIgMDwYZte1j9/HY2bNvDwMBBPwuaNeSwZheQIyLuAe4B6OrqOiAdAwPBQ70vsWTFWvr6B+ic2sGyyxfSveA4OjrUlHqt/VT5DvQj4E01z09My6reF4CNO/YOhQegr3+AJSvWsnHH3tG8jNmIqgzQGuBUSSdLOhy4ElhZct+HgUskzUwXDy5Jy0rbsqtvKDyD+voH2Lq7bzQvYzaiygIUEfuAGyn+8dcBKyKiV9Jtki4FkPQ2SZuADwCfktSb9n0Z+ARFCNcAt6Vlpc09qpPOqQceXufUDubM6Mw8MrOfULvcD9TV1RU9PT1Dz/0ZyMZY3X+alr6IMJKODtG94DhOX3wBW3f3MWdGJ/NmTXN4bEy1bYCgCNH82dOZP3t6s0uxNtU2Y+HMmsEBMsvgAJllcIDMMjhAZhkcILMMDpBZBgfILIMDZJbBATLL4ACZZXCAzDI4QGYZHCCzDA6QWQYHyCyDA2SWwQEyy+AAmWVodneGIyR9Ia1/XNK8tHyqpOWSnpG0TtJHq6zTrFHN7s7wO8DOiDgFuB34s7T8A8AREXEm8FbgusFwmU0kTe3OkJ4vT7//E3CxJFF0BJsm6TDgSOB1YFeFtZo1pNndGYa2STOZvgrMogjTXmAzRQ+hv6w3M+mhujOYVW2iXkQ4B9gPnACcDHxY0vzhG0XEPRHRFRFds2fPHu8azZrenWFom3S6djSwA7gKeCgi+iNiK/AtoKvCWs0a0uzuDCuBwf6ni4B/j2Ky7heBiwAkTQPOA75XYa1mDWlqdwbgXmCWpPXAEmDwUvfdwPTUrWENcH9qBWk2obRtdwazMVa3K8FEvYhg1hIcILMMDpBZBgfILIMDZJbBATLL4ACZZXCAzDI4QGYZHCCzDA6QWQYHyCyDA2SWwQEyy+AAmWVwgMwyOEBmGRwgswwOkFkGB8gsgwNklmFCdmdI686StFpSb+rS0FllrWaNmJDdGdIspZ8Fro+IBcCFQH9VtZo1aqJ2Z7gEeDoivgsQETsiYn+FtZo1ZKJ2Z3gzEJIelvSkpD+osE6zhh12sBWS7qTo01NXRCyupKLCYcD5wNuAHwOrJD0REauG1XgtcC3ASSedVGE5ZvWN9A7UAzwBdAI/DzyXHguBw0u8dk53hk3AYxGxPSJ+DDyYajiA25tYsx00QBGxPCKWA2cBF0bEnRFxJ3AxRYgOJac7w8PAmZJ+KgXrncCzozgus3Fx0FO4GjOBo4DBDnHT07IRRcQ+SYPdGaYA9w12ZwB6ImIlRXeGz6TuDC9ThIyI2ClpGUUIA3gwIr46ukMzq94huzNI+i1gKfAoxQz17wCWpnenCcPdGaxidbszjPgOJKkD+D5wbnoAfCQiXhrb2sxa04gB
iogBSXdHxNnAV8apJrOWUeZ7oFWS3p++4DSzGmUCdB3wReA1Sbsk7Za0q+K6zFrCIa/CRcSM8SjErBWVuYyNpJnAqRRfqgIQEY9VVZRZqzhkgCT9LnATxUiCtRQt51eT2tCbTWZlPgPdRDEm7YWIeBdwNvBKlUWZtYoyAeqLiD4oboCLiO8Bp1VblllrKPMZaJOkY4AvA49I2gm8UGVRZq2izFW496Vfl0p6lGLE9EOVVmXWIg55CifpPEkzACLiG8B/UHwOMpv0ynwG+jtgT83zPWmZ2aRXJkCKmiHbETFAye+PzNpdmQBtkLRY0tT0uAnYUHVhZq2gTICuB95Ocfv1JorbGq6tsiizVlHmKtxW0p2iZnagMlfhlqfvgQafz5R0X6VVmbWIMqdwZ0XEK4NPImInvoxtBpQLUEcajQ2ApGPxVTgzoFwQ/gpYLemLFBMrLAI+WWlVZi2izEWET0t6AnhXWnRZRHiONjNKzo0dEb3ACoqJEPdIKjWPbk57k7T+JEl7JP1+mb9nNt7KXIW7VNJzwA+BbwAbga+V2K/h9iY1lpX5W2bNUuYd6BMUd6H+ICJOppja979K7JfT3gRJ76UIbW+Jv2XWFGUC1B8ROyiuxnVExKNAV4n9Gm5vImk68BHg4yP9AUnXSuqR1LNt27YSJZmNrTIBeiX9Q38T+JykO4C91ZbFUuD2iNgz0kbuzmDNVuYy9qVAH8XcCFdTTDQ/4jtDMpr2JpuGtTc5F1gk6c+BY4ABSX0RcVeJv2s2bkZqsPWfEXE+sIWfNNoanJ30jyW9DPxFRPztQV5iqL0JRVCuBK4ats1ge5PVHNje5IKaOpYCexwem4gOGqAUnoNOrChpFvBtoG6ActqbmLWKQ7Y3GXFn6fiI2DyG9TTM7U2sYnXnhs9qMjxRwmPWLFV26TZrew6QWQYHyCyDA2SWwQEyy+AAmWVwgMwyOEBmGRwgswwOkFmGSTk91cBAsHHHXrbs6mPuUZ3MmzWNjo66Q53MRjTpAjQwEDzU+xJLVqylr3+AzqkdLLt8Id0LjnOIbNQm3Sncxh17h8ID0Nc/wJIVa9m4o+qbbK0dTboAbdnVNxSeQX39A2zd3dekiqyVTboAzT2qk86pBx5259QO5szobFJF1somXYDmzZrGsssXDoVo8DPQvFnTmlyZtaJJdxGho0N0LziO0xdfwNbdfcyZ4atw1rhJFyAoQjR/9nTmz57e7FKsxU26UzizseQAmWWoNECNdmeQ9G5JT0h6Jv28qMo6zRpVWYAyuzNsB94TEWdSTLz4marqNMtR5TtQw90ZIuKpiPjftLwXOFLSERXWataQKgPUcHeGYdu8H3gyIl4b/gfcncGabUJfRJC0gOK07rp6692dwZqtygCNpjsDw7ozIOlE4F+AD0bE8xXWadawKgM01J1B0uEUE8evHLbNYHcGqOnOIOkY4KvAzRHxrQprNMtSWYDSZ5rB7gzrgBWD3RkkXZo2u5eiI916YAkweKn7RuAU4BZJa9NjTlW1mjUqqzvDROLuDFaxse/OYDbZOUBmGRwgswwOkFkGB8gsgwNklsEBMsvgAJllcIDMMjhAZhkcILMMk3Jaq3rcscEa4QDhjg3WOJ/C4Y4N1jgHCHdssMY5QLhjgzXOAcIdG6xxvoiAOzZY4xygxB0brBE+hTPL4ACZZaj0FE5SN3AHMAX4+4j402HrjwA+DbyVYkLFKyJiY1r3UYrJ5/cDiyPi4SprPZh6IxSAuqMWvG37bFv2829lAarpzvBuinmx10haGRHP1mw21J1B0pUU0/hekbo4XAksAE4A/k3SmyNif1X11lNvhMJdV53N6/viDaMWLnnLXL6+bou3bYNtRzMKZUJ2Z0jLPx8Rr0XED4H16fXGVb0RCk9verXuqIXezW9c7m1bc9vRjEKZqN0ZyuxbeXeGeiMUBoK6oxY2v+pt22Xb0YxCaemLCFV3Z6g3QmGKqDtq4fijj/S2bbLtaEahTNTu
DGX2rVy9EQpnnnh03VELC44/ytu2ybajGYVS2dzYKRA/AC6m+OdfA1wVEb0129wAnBkR16eLCJdFxOWpL9ADFJ97TgBWAaeOdBGhqrmxB6/Q1I5QAN6wrPZqjrdt/W3rXECoe0Wh0snlJf0q8NcUl7Hvi4hPSroN6ImIlZI6Kfqfng28DFwZERvSvh8DfhvYB/xeRHxtpL/lyeWtYuMfoPHkAFnF3J3BbKw5QGYZ2uYUTtI24IVm15H8NLC92UVUZLIe2/aI6B6+sG0CNJFI6omIrmbXUQUf24F8CmeWwQEyy+AAVeOeZhdQIR9bDX8GMsvgdyCzDA6QWQYHKJOk+yRtlfTfNcuOlfSIpOfSz5nNrLERkt4k6VFJz0rqlXRTWt4Ox9Yp6TuSvpuO7eNp+cmSHpe0XtIXJB1+qNdygPL9AzD8C7abgVURcSrFSPKbx7uoMbAP+HBEnAGcB9yQbrVvh2N7DbgoIn4OWAh0SzqPYkqB2yPiFGAnxZQDI3KAMkXEYxQjyWvV3qq+HHjveNY0FiJic0Q8mX7fDayjuCu4HY4tImJPejo1PQK4iGJqASh5bA5QNeZGxOb0+0vA3GYWk0vSPIpbTh6nTY5N0hRJa4GtwCPA88AraWoBOMg0AsM5QBWL4nuClv2uQNJ04J8p7snaVbuulY8tIvZHxEKKu53PAU5v5HUcoGpskXQ8QPq5tcn1NETSVIrwfC4ivpQWt8WxDYqIV4BHgV8Ajkl3UkPJaQQcoGqsBK5Jv18DfKWJtTQkTS92L7AuIpbVrGqHY5st6Zj0+5EUcxeuowjSorRZqWPzSIRMkv4RuJBiKPwW4Fbgy8AK4CSKWywuj4jhFxomNEnnA98EngEG5336Q4rPQa1+bGdRXCSYQvEmsiIibpM0n2L+wmOBp4CrI+K1EV/LATJrnE/hzDI4QGYZHCCzDA6QWQYHyCyDA9QmJM2rHRF+iG2/XXU9k4UDNAlFxNubXUO7cIDakKT5kp6S9M5038taSU9LOjWt35N+3pbWrZX0I0n3p+VX1+z3qdRt0OpwgNqMpNMoxq99iGJYyh1p0GQXxQjjIRFxS1p3IcUtGXdJegtwBfCLad1+4DfHp/rWU2mTYRt3synGb10WEc9KWg18TNKJwJci4rnhO6Qxb58FlkXEE5JupGj6vKZYxZG0+IDRKvkdqL28CrwInA8QEQ8AlwL/Bzwo6aI6+ywFNkXE/em5gOURsTA9TouIpZVX3qIcoPbyOvA+4IOSrkqDIzdExN9QvDOdVbuxpPcAvwQsrlm8ClgkaU7a5lhJPzsu1bcgn8K1mYjYK+nXKO6y/FfgCkn9FHeP/smwzZdQ3HX5nXS6tjIibpH0R8DXJXUA/cANTJyJ+ycUj8Y2y+BTOLMMDpBZBgfILIMDZJbBATLL4ACZZXCAzDL8P8f0QbsmO2CpAAAAAElFTkSuQmCC",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.relplot(data=p66_cd47_df, x=\"ksize\", y=\"jaccard\", height=3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 193,
+ "id": "5afd7f1c-d1e1-40c0-97b8-396fcd0d7c52",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T19:00:34.862568Z",
+ "iopub.status.busy": "2024-11-12T19:00:34.862436Z",
+ "iopub.status.idle": "2024-11-12T19:00:34.950627Z",
+ "shell.execute_reply": "2024-11-12T19:00:34.950263Z",
+ "shell.execute_reply.started": "2024-11-12T19:00:34.862557Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 193,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADRCAYAAABSOlfvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWz0lEQVR4nO2de5RcVZWHv1+HlsaQAIYAEYhJEEQiTJAWEc0MomDEBw4IxAcIOuJCniK+QVFH14JBHBkdmQjKY/GKIBLezwDCACEkIS9QIDSKQkiiBBKmIaH3/HFOJZVKVXWdvnW7qrr2t9Zdde+559zaJ9U7Z59z99lbZobjOAOno9ECOE6r40rkOBlxJXKcjLgSOU5GXIkcJyMtr0RTpkwxwA8/6nEMiJZXouXLlzdaBKfNaXklcpxG40rkOBlxJXKcjGzSaAEcp5S+PqNnxWqWvtTLtiO7GDdqOB0darRYFXElcpqKvj7jlkXPc+r0efSu6aOrs4NzD5/ElInbNa0iuTnnNBU9K1avUyCA3jV9nDp9Hj0rVjdYssr4SOQ0jHJm29KXetcpUIHeNX0sfakXoClNPFcipyFUMtt2GzOCrs6ODRSpq7ODNa8bB533h6Y08dyccxpCJbPt9T449/BJdHWGP82uzg7OOnQPzrhuQdOaeD4SOYNCqem2YvWrZc22Zat6mTJxO3Y9aTIvvNzLNiNC3WdW/N9GdV94uZcJozcfzG6UxZXIyZ1ypttZh+7BW0ZttoFydHV2sM2ILjo6xITRm2+gIOVMvG1GdA1qPyrh5pyTO+VMt29cM58fHrz7BmbbuYdPYtyo4Ru1Hzdq+EYmXqW6jcBHIid3Kq24dQ4TNxWZbZVW3Do6tJGJ56tzzpCmdP4zZouusubYtiO7NjLbKlHOxGsWXImculJp6frnn96TEy6fu0FZs5hjWVGrh8zq7u622bNnN1oMJ7Jk2ap173MKdHV2cOOJk5FoSnOsiAEJ5CORM2BSPA6WreplnwlbN6U5lhVXImdApHocNMtydB74ErczIFI8DobS/KccuY5EkrqAe4FN43ddbWbfk3QCcAqwEzDazJbH+gJ+BhwEvAIcbWZz8pTRqY0sHgdNOv+pG3mbc68C+5vZKkmdwH2SbgbuB24A7i6p/2Fg53i8G/hl/HQaSD08DoYyuZpzFlgVLzvjYWY218x6yjQ5GLgktnsQ2FLSmDxldPonq8fBUCf3hQVJw4BHgLcCvzCzh6pU3x74S9H1s7HsuZJnHgscCzB27Ni6yuvUbrrV6nEw1MldiczsdWCSpC2BayW9w8wWZnzmNGAahPdE2aV0CqSYbikeB0OZQVudM7MXgZnAlCrV/grsWHS9QyxzBgk33dIZ0EgkqQPY3Mxe6qfeaGCNmb0oaTPgAOCsKk1mACdIupKwoLDSzJ6rUt+pM1mdRduRmkciSZdLGilpOLAQWCzpa/00GwPMlDQfeBi43cxukHSSpGcJI818SRfE+jcBS4AngV8BX07sj5NAX5+xZNkqHnhqOUuWraKvz9h2ZNe6EadAselW8DpwBVpPzb5zkuaZ2SRJnwHeCXwTeMTM9shTwP5w37mBUcnj4MC3b8ttjy1tqZBVdSR337nO+K7nE8DPzWyNJJ/UtyiVPA5uOmly270szUqKEp0P9ACPAvdKegtQdU7kNA+1LlsX4hb4qlvt1KREcSFhqZltX1T2Z+D9eQnm1I9UjwMnjZoWFsysD/h6SZmZ2dpcpHLqii9b50uKOXeHpNOAq4B1Ab/M7O91l8rJRLt4HDRL4PsUJToifh5fVGbAhPqJ42SlXTwOminwfc3vicxsfJnDFajJaBfTrZkC39c8Ekl6I3AqMNbMjpW0M/A2M7shN+mcqqRsz251062USv1sRFTUFHPuNwRv7H3j9V+B3xL2BTmDTOr27FY23cpR8Kxohm3oKQ6oO5nZ2cAaADN7hQG+4XWy0+7bs5spKmrKSPRadCI1
AEk7EXauOoOAb8/ekGaKipqiRN8DbgF2lHQZ8F7g6DyEcjbEt2eXp1n6mbI6dztwCEFxrgC6zezufMRyimmXFbdWJXU/0fbAsNjunyVhZr+rv1hOMe2y4taqpCxx/xrYA1gEFH5RA1yJ6kweAeGd/EgZifYxs91yk8QB2jMgfKuTsinvQuAnZrY4X5HSGGqb8lo8IHyrk/umvEuAByQ9T1jaFsGZu6E7W1sZDwg/NEhRoguBI4EFrJ8TOQPEA8IPHVI8FpaZ2Qwze9rMnikcuUk2xGl3j4OhRMpINFfS5cD1FHkq+BJ3bbjHwdAlRYk2IyjPgUVlvsRdA+5xMLRJUaKvlu5ilTS+zvIMSSp5HEw7sptjL53ty9YtTooSXS/pw4Wop5LeTtgK8Y5KDarkJxoPXAmMImyvONLMXpO0KWEVcC9gBXBEhewRTU27bM92AilK9GOCIn0EeBvhj/0z/bSplJ/oVOCnZnalpPOBLxByEX0B+IeZvVXSVELI4SMqPbwZaZft2c56UhxQbwR+CtwGXAT8q5nN66dN2fxEwP7A1bH8YkJASAj5iS6O51cDH4jZ81oGdxZtP/odiST9F3EPUWQL4ClC4HnM7KR+2m+Qnyi2fbEo3FYhBxEU5Scys7WSVhJMvuUlz2za/ETuLNp+1GLOlfrUPJLyBaX5iYBdU9pXeGZT5Ccq53FQaduym25Dl36VyMwu7q8OgKRrzOzQKs95UdJM4D2ENJKbxNGoOAdRIT/Rs5I2IYx6K2r5/sGmWkD4cw+ftFG5m25Dl3pmytsofFaV/EQzgU8SVug+B1wXm8yI1w/E+3dZrR6yg4wHhHcK1FOJyv2xjwEujvOiDmB6zE+0GLhS0r8Dcwl+ecTPSyU9CfwdmFpH+TLhAeGdSuSas9XM5gN7lilfAuxdprwXOCxPmQaCB4R3qlHPnK1D1l7xZWunGkkjUZzXjDWzP5a5/Y36iNR43OPASSElxsLHgHOANwDjJU0CfmBmHwcws9tykXCQcY8DJ5UUc+5MwjzmRYDorTDkHFDddHNSSTHn1pjZyhIvnKZcfq6Vdg4I79SPFCVaJOnTwLCYEeIk4H/zESt/2j0gvFM/Usy5E4GJBM/sKwhJj0/JQaZBwbdnO/Wi5pEoZoH4TjxanmpRddzjwEkhZXVuJmXmQGa2f10lyolao4r69mwnlZQ50WlF513AoUBLZA/3qKJOntQcAbVsY2mWmW3kvjOY1BIB1aOKOjWSbwRUSW8quuwgxEHYYiBfmjcp4ak8qujQo9yrizz/c0wx5x4hzIlEMOOeJsREaCrcWbS9qWS6T5m4XW6KlBJjYbyZTYifO5vZgWZ2Xy5SZcA9DtqbSq8uelaszu07U8y5Q6rdb5ZIqO5x0N5U+v0L+7zyIMWc+wKwL3BXvH4/wWNhGU0UCdVjHLQ3lX7/PE33FI+FTmA3Mzs0xlKYCHSa2TFm9vl8xEunmVKzO4NPI37/lCRfj5nZ24uuO4BFxWWNoNwSd2F1xk239iTD7597kq87Jd1K8JuDEJn0joF8ad64x0F7M9i/f4rv3AlxcWFyLJpmZtfmI5bjtA5J28PjClxTLCA4TrNQSxjh+8zsfZJeZkMH1ELO1pG5Sec4LUAtEVDfFz9H5C+O47QeSSGzJA2T9GZJYwtHP/V3lDRT0mJJiySdHMv/SdIDkhZIul7SyKI235L0pKQ/SvrQwLrlOINHisfCicD3gKWszx5uwB5Vmq0lZNibI2kE8Iik24ELgNPM7B5Jnwe+BpwhaTdC1NOJwJuBOyTtEoPiO05TkrKwcDLwNjOrOcC8mT0HPBfPX5b0GCF9yi6EDHoAtwO3AmcQ8hNdaWavAk/HcMJ7E2JzO05TkmLO/QVYOdAvkjSOEFL4IWARQWEghA3eMZ6vy08UKc5d5DhNScpItAS4W9KNhGAlAJjZuf01lLQ5cA1wipm9FE248ySdQcgE8VqK0M2c5MtpP1KU6M/xeEM8aiLmar0G
uKzg6W1mjwMHxvu7AB+J1Qv5iQoU5y5aR7Mk+XIcSPNY+H7qw2O+1QuBx4pHLEnbmNkL0f/udOD8eGsGcLmkcwkLCzsDs1K/13EGk7yj/bwXOBJYIGleLPs2sLOk4+P174DfxGctkjQdWExY2TveV+acZifFi3uvost10X7M7Ot5CFYrtQQqcZwaydeL28xKEx7fL8lNLaftGZLRfhxnMBly0X4cZ7BJMeeq5iKSdICZ3Z5dJMdpLeqZs/WsOj7LcVoGT3zsOBmppxK554DTltRTiRynLamnEvXU8VmO0zIkBSqRtC8wrridmV0SP6uGGXacoUrKy9ZLgZ2AeUDBn82AS+ovluO0DikjUTchjLAvIDhOESlzooXAdnkJ4jitSspItDWwODqdFu9s/XjdpXKcFiJFic7MSwjHaWVSfOfuqXZf0gNm9p7sIjlOa1HP90SeANVpS9ztx3Ey4m4/jpMR9+J2nIzUrESSTpS0VZUqR9ZBHsdpOVJGom2BhyVNlzQlxpRbh5ktrK9ojtMa1KxEZnY6IZjihcDRwBOSfixpp5xkc5yWIGlOFP3mno/HWmAr4GpJZ+cgm+O0BClzopMlPQKcDdwP7G5mxxFCZx1aoU2lJF+TJD0oaZ6k2ZL2juWSdF5M8jVf0jsz99BxcibF7Wcr4BAze6a40Mz6JH20QptKSb7OBr5vZjdLOihe7wd8mGAy7gy8G/hl/HScpqWmkUjSMGBqqQIVMLPHKpQ/Z2Zz4vnLQCHJlwGFFJNbAH+L5wcDl1jgQWBLSWNq7YzjNIKaRiIzez3mUB1rZn8eyBeVJPk6BbhV0jkERd43VquU5Ou5kmd5fiKnaUhZWNgKWCTpTkkzCkctDUuTfAHHAV8xsx2BrxBW/GrGzKaZWbeZdY8ePTqlqePUnZQ50RkD+YJySb6AzxFywAL8lpAIGWpM8uU4zUTKe6J7CBF9OuP5w8Ccam0qJfkizIH+JZ7vDzwRz2cAR8VVun2AlTF5suM0LSmBSr5ImIe8iRCwZHtChrsPVGlWKcnXF4GfSdoE6I3PBbgJOAh4EngFOKZW+RynUaSYc8cDexMWBjCzJyRtU62Bmd1HZcfUvUoL4svc48vUdZymJWVh4VUzW5flO44ivofIaXtSlOgeSd8GNpN0AGFB4Pp8xHKc1iFFib4JLAMWAF8izF9Oz0Mox2klUgKV9AG/iofjOJGU1bmnKTMHMrMJdZXIcVqM1DDCBbqAwwjL3Y7T1qS8bF1RdPzVzP4T+Eh+ojlOa5BizhXv7ekgjExJqVkcZyiSogQ/Yf2caC3BBeiwegvkOK1GihLdQFCiggeCAR8txCsp8Y1znLYhRYn2At4FXEdQpI8Bs1jvPOo4bUmKEu0AvDPuUEXSmcCNZvbZPARznFYhNe7ca0XXr8Uyx2lrUkaiS4BZkq6N158ALqq3QI7TaqS4/fxI0s3A5Fh0jJnNzUcsx2kdkt7zxMg9VXezOk674alVHCcjrkSOkxFXIsfJiCuR42TElchxMuJK5DgZcSVynIzkqkRV8hNdFXMTzZPUUxTYEUnfivmJ/ijpQ3nK5zj1IO9NdWXzE5nZEYUKkn4CrIznuwFTgYnAm4E7JO1iZq/nLKfjDJhcR6Iq+YmAdbG6DweuiEUHA1ea2atm9jQhnPDeecroOFkZtDlRSX6iApOBpWZW2JNUKT9R6bOOjWkqZy9btiwniZ2hRF+fsWTZKh54ajlLlq2ir69+wXsHJUZCmfxEBT7F+lGoZsxsGjANoLu720MZO1Xp6zNuWfQ8p06fR++aPro6Ozj38ElMmbgdHR2VQsXXTu4jUYX8RIVY3ocAVxVV9/xETt3pWbF6nQIB9K7p49Tp8+hZsbouz897da5SfiKADwKPm9mzRWUzgKmSNpU0npAAeVaeMjpDn6Uv9a5ToAK9a/p44eXeujw/75GokJ9o/6Il7YPivamUmHJmtgiYDiwGbgGO95U5Jyvbjuyiq3PDP/Wuzg62
GdFVl+crpARqXbq7u2327NmNFsNpYhLmRAOaIHnwRWfI09Ehpkzcjl1PmswLL/eyzYguxo0aXpdFBXAlctqEjg4xYfTmTBi9ef2fXfcnOk6b4UrkOBlxJXKcjLT86pykZcAzjZajAlsDyxstRJ0Zyn1abmZTUhu3vBI1M5Jmm1l3/zVbB+/Txrg55zgZcSVynIy4EuXLtEYLkAPepxJ8TuQ4GfGRyHEy4krkOBlxJRogkrokzZL0aIxk9P1YPl7SQzFi0VWS3hDLN43XT8b74xragTJU6dMJUW6TtHVRfUk6L96bX5Jhvmmo0q/LYlSphZJ+HTeQpvfLzPwYwEFwm988nncSYkfsQ9gPNTWWnw8cF8+/DJwfz6cCVzW6Dwl92hMYR8gYv3VR/YOAm2O7fYCHGt2HxH4dFO+JsLet8Fsl9ctHogFigVXxsjMeBuwPXB3LLyZkFIQQyejieH418AEVUq83CZX6ZGZzzaynTJODgUtiuweBLSWNGSRxa6ZKv26K94ywg3qHWCepX65EGZA0LAaefAG4HXgKeNHM1sYqxdGK1kUyivdXAqMGVeAaKO2TmT1UpXpN0ZmagWr9imbckYTd1JDYL1eiDJjZ62Y2ifA/2N7Aro2VKDulfZL0jgaLVBf66dd/A/ea2R8G8mxXojpgZi8CM4H3EIb+wmbH4mhF6yIZxftbACsGV9LaKepTNYfMlovOVNovSd8DRgOnFlVL6pcr0QCRNFrSlvF8M+AAQoTXmcAnY7XPAdfF8xnxmnj/rmiLNw0V+vR4lSYzgKPiatY+wEozey5/SdOo1C9J/wZ8CPiUmRWHA0rrV6NXTlr1APYA5gLzgYXAd2P5BMIk9Ungt8CmsbwrXj8Z709odB8S+nQSYV6wFvgbcEEsF/ALwlxwAdDd6D4k9mttlH1ePArlSf1ytx/HyYibc46TEVcix8mIK5HjZMSVyHEy4krkOBlxJXKcjLgSOcTk01v3X9MphytRC1DkRtSStLr8/eFKlAOSviPpT5Luk3SFpNMk3S2pO97fWlJPPB8m6T8kPRw3gH0plu8n6Q+SZgCLJf1A0ilF3/EjSSdX+P794vddLenxuPms320XkjaTdLOkL0oaF9teFPtymaQPSrpf0hOS9o5thscNbbMkzZV0cCw/WtIMSXcBd0oaI+lehRxVCyVNzvav3EQ02iVjqB3AXgRXkTcCIwluPqcBdxPdRwgRN3vi+bHA6fF8U2A2MB7YD1gNjI/3xgFz4nkHwSVlVAUZ9iNstdgh1n0AeF8VmXvi8+8Ajir6vrXA7vEZjwC/JrjEHAz8Ptb7MfDZeL4l8CdgOHA0wVXoTfHeV4HvxPNhwIhG/1b1Oob0MNsgJgPXmtkrAHEkqcaBwB6SCk6rWxDSbL4GzDKzpwHMrEfSCkl7AtsCc82smhf4LIupPOM+mnHAfVXqXwecbWaXFZU9bWYL4jMWAXeamUlaEJ9XkP/jkk6L113A2Hh+u5n9PZ4/DBS2YP/ezOZVkaWlcHNu8FjL+n/v4jyHAk40s0nxGG9mt8V7pZl5LyD8D38MYVSoxqtF56/Tfy6q+4EpJWZf8TP6iq77ip4n4NAi+cea2WOl8pvZvcA/E7YUXCTpqH7kaRlcierPvcAn4vxiBPCxWN5DMPVg/VYJgFuB44qCZOwiaXiFZ19L2AfzrtiunnwX+AfBezmFW4ETC8oXR8qNkPQWYKmZ/Yrwn0FTBjUZCK5EdcbM5gBXAY8Sgl08HG+dQ1CWuYQ5UYELCIme50haCPwPFUYNM3uNsF9puuWTEPpkYDNJZye0+SEhZsH8aPL9sEK9/YBHY/+PAH6WRdBmwrdC5IykM4FVZnZOHZ7VAcwBDjOzJ7I+z6kPPhK1CJJ2I6z03ekK1Fz4SNTCSNoduLSk+FUze3eF+tcSls+L+YaZ1Xt+1Va4EjlORtycc5yMuBI5TkZciRwnI65EjpOR/wedEK9zJEHLIwAAAABJRU5ErkJggg==
",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.relplot(data=p66_cd47_df, x=\"query_n_kmers\", y=\"query_n_unique_kmers\", height=3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 194,
+ "id": "fabf3a25-b8e1-4a83-9618-edfddf8939f6",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T19:00:34.951942Z",
+ "iopub.status.busy": "2024-11-12T19:00:34.951149Z",
+ "iopub.status.idle": "2024-11-12T19:00:35.041065Z",
+ "shell.execute_reply": "2024-11-12T19:00:35.040683Z",
+ "shell.execute_reply.started": "2024-11-12T19:00:34.951924Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 194,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAADQCAYAAACZZoRKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUmklEQVR4nO3df7TUdZ3H8efr4q1rCv4gEDMRMNSkjPKumelWtBZhRmmhmabm5m4HI2Pbflpq7XZObtFq7qmlNcW2RNLa0MxSI7MiEYRUsI4KZLr8NhBoLwL3vX98P3MZhpm587nf+d6Z78z7cc6cO/Od73fmM/fM+3w/38+8P5+3zAzn3MB1NLoBzuWdB5FzKXkQOZeSB5FzKXkQOZdS7oNo8uTJBvjNb4NxKyv3QbRx48ZGN8G1udwHkXON5kHkXEoeRM6ltF+jG5A3vb3G6k3bWfd8D4cN62LM8APo6FCjm+UayIMoQm+vcffytcyct4yenb10dXYwa9pEJk8Y5YHUxrw7F2H1pu19AQTQs7OXmfOWsXrT9ga3zDWSn4kqKNdtW/d8T18AFfTs7OW57TsAvIvXpjyIyqjUbTv2sKF0dXbsFUhHDd+fZzf3cP4Ni7yL16a8O1dGpW7bkA6YNW0iXZ3Jv62rs4MvTX01n7r9Ee/itTE/E5VRqdu29vkeJk8YxXEzTmP91h5GDu2quO/6rT2MGX6Aj+S1AQ+iMg4b1rVPt62rs4ORQ7vo6BDjRhzIuBEH7vVc6b6jhnX5SF6b8O5cGWOGH7BPt23WtImMGX5Azfvu7sVH8tqEn4nK6OjQPt22Sl2xSvs+uGpTxW5e8VnM5Z8HUQXlum0x+1bqEo4a1sXKDdv8OqmFeHcuI+W6edef91pWrNnKlOse4P3ffpAp1z3A3cvX0ttbcaqKywHlfcms7u5uW7x4caObUVbhB9tCN88MzvjGA/ucne6acZp38fKhbJfBu3MZKu3mLXxqow+HtyAPokFU7TrJh8Pzy6+JBpEPh7emTM9EkrqAXwEvDu91m5ldKeky4HLgaGCEmW0M+wu4FpgC/BW4yMwezrKNg2kgw+HezWt+WXfndgCTzGybpE7g15J+CvwGuBP4Zcn+7wDGh9vrgW+Gvy0jdjjcu3nNL9PunCW2hYed4WZmttTMVpc5ZCpwczjud8DBkg7Pso3NwLt5+Zb5wIKkIcAS4BXAf5jZg1V2PwL4c9HjZ8K2NSWveSlwKcDo0aPr2t5GiO3m+fyl5pJ5EJnZbmCipIOBH0l6lZk9lvI1ZwOzIfmdKH0rG6/Wbp7PX2o+gzY6Z2abgQXA5Cq7PQscWfT45WFbWyrXzfP5S81nQGciSR3AgWb2fD/7jQB2mtlmSfsDpwNfqXLIfOAySXNJBhS2mNmaKvu3tHLdPJ+/1HxqDiJJ3wf+EdgNPAQMk3Stmf1blcMOB+aE66IOYJ6Z3SlpBvBJYBTwiKS7zOzvgbtIhrefJBnivnggH6qV+Pyl5ldz7pykZWY2UdIHgNcBnwaWmNkJWTawP82cO5eFaus/eF5e5lLnznWG33reDVxvZjsltcRFfZ74/KXmExNE3wJWA78HfiXpKKDqNZHLhs9fai41BVEYSFhnZkcUbXsaeEtWDXNxCiN5xd28wvwlv07KVsw10WIz6864PdHa7ZqoGp+/lLnU10T3SvoEcCvQ96OEmT2XsmEN1UoL1Pv8pcaICaJzwt/pRdsMGFe/5gyuVl+g3hNbB0fNGQtmNrbMLbcBBK2/QL0ntg6OmB9bXwLMBEab2aWSxgPHmtmdmbUuY9V+/W+FawafvzQ4YrpzN5JkY58SHj8L/IBkXlAuVVvptFX4/KXsxSSgHm1m1wA7Aczsr1QYrciLmJVOW4l38+or5kz0QkgiNQBJR5PMXM2tmJVOW4nPX6qvmCC6ErgbOFLS94A3Ahdl0ajBFLPSaSvx+Uv1EzM6dw9wFkng3AJ0m9kvs2mWawSfvzQw
sfOJjgCGhOP+VhJm9sP6N8s1Quz8pXY7e1cSM8T9HeAEYDlQ+K8a4EHUQmqdvzRyaFdLZXukEXMmOtnMjs+sJa4plUtsnTVtIqMPeYkPhwcxCag3AF8zsxXZNimOJ6BmrzSxtfCD7JTr2i65NXUC6s3AQklrSYa2RbKGXENntrrslevi+VoPe8QE0Q3ABcCj7Lkmcm3Ksx72iMlY2GBm881slZn9qXDLrGWuqXnWwx4xZ6KlYcWfOyjKVPAh7vbkWQ97xATR/iTB87aibT7E3cY86yERMzp3aOksVkljzWxVJi2rkY/ONZdyEx1nX9DNpd9d3AojealH5+6Q9I7CqqeSXkkyFeJVFd+xcn2iscBcYDjJ9IoLzOwFSS8mGQU8EdgEnFOheoRrUu24amtMEH2ZJJDOAI4l+bJ/oJ9jKtUnmgl83czmSvoWcAlJLaJLgL+Y2SsknUuy5PA5lV7cNad2W7U1JgH1J8DXgZ8DNwHvMbNl/RxTtj4RMAm4LWyfQ7IgJCT1ieaE+7cBbw3V81yOtfpIXr9nIknfIMwhCg4CniJZeB4zm9HP8XvVJwrHbjazXWGXQg0iKKpPZGa7JG0h6fJtLHnNlqpP1OpafSSvlu5c6VX7kpg3KK1PBBwXc3yF12y5+kStrpVH8voNIjOb098+AJJuN7Ozq7zOZkkLgDeQlJHcL5yNimsQFeoTPSNpP5Kz3qZa3t/lT7nk1i9NffVeI3mFLt5xTTySV89Kefssn1WlPtEC4L0kI3QXAj8Oh8wPjxeG539htY7Bu9xplflL9Qyicl/2SvWJVgBzJf0LsJQkL4/w97uSngSeA86tY/tcE2qF+UuZ1mw1s0eA15bZvhI4qcz2HuB9WbbJNbc8zl+qOWOh3xeSlprZPgGTNc9YaD1NPH8pdcYC4bpmtJn9sczTnxpIq5wrlbf5SzFrLJwJfBV4ETBW0kTgi2b2LgAz+3kmLXSO5p6/FDOf6CqS65jNACFbYWzdW+RcGc2c9RDTndtpZltKsnB8+NkNimbOeogJouWSzgOGhIoQM4DfZtIq58po1qyHmO7cR4EJJJnZt5AUPb687i1yLkIzrNpa85koVIH4XLg51xSaYf5SzOjcAspcA5nZpFQtcC6lRs9fiunOfQL453D7PLCMfTO8nWu4wR7Ji+nOlU6B+I2kRane3bkMDKTMZpqsh5ju3KHF7SRZB+GgAb9zAzRrAqOrv9gymys3bBvw9yJmiHsJyTWRgF3AKpI1EXKh3Co0zZLA6AZHueTW6897LSvWbE31vahbAmqj1JqAunLDtmZIYHQNVprcagZnfKPm70W6BFRJZ1V7vtlXQs3jZC9Xf6XdvIVPbUz9vYjpzl0CnAL8Ijx+C0nGwgZysBJqpf7wyKFdDWyVa7R6fC9ihrg7gePN7OywlsIEoNPMLjazD0W8TkNUGvYcM/yABrfMNVI9vhcxywg/bmavLHrcASwv3tYIMZPyyk328kEFF/G9SD0p7z5JPyPJm4NkZdJ745rbWOWGPZ1L+72I+bH1sjC4cFrYNNvMfjSgd3WuhURNDw8jcE09gODcYKtlGeFfm9mpkraydwJqoWbrsMxa51wO1LIC6qnh79Dsm+Nc/sQMcSNpiKSXSRpduPWz/5GSFkhaIWm5pI+F7a+RtFDSo5LukDSs6JjPSHpS0h8lvX1gH8u5wROTsfBR4EpgHXuqhxtwQpXDdgH/ZGYPSxoKLJF0D/BfwCfM7H5JHyJMr5B0PMmqpxOAlwH3SjomLIrvXFOKGVj4GHCsmdW8wLyZrQHWhPtbJT1OUj7lGJIKegD3AD8jmaM0FZhrZjuAVWE54ZNI1uZ2rinFdOf+DGwZ6BtJGkOypPCDwHKSgIFk2eAjw/2++kRBce0i55pSzJloJfBLST8hWawEADOb1d+Bkg4EbgcuN7PnQxfuOkmfJ6kE8UJMo73Il2smMUH0dLi9KNxqEmq13g58r5DpbWZ/AN4Wnj8GOCPsXqhP
VFBcu6iPF/lyzSQmY+Hq2BcP9VZvAB4vPmNJGmlm60P+3RXAt8JT84HvS5pFMrAwHvAp6K6pZb3azxuBC4BHJS0L2z4LjJc0PTz+IXBjeK3lkuYBK0hG9qb7yJxrdjFZ3CcWPewCzgZ2mdkns2hYrby0ihtE6bK4fbUf58prq9V+nMtC26z241xWYrpzVWsRSTrdzO5J3yTn8iUqAbUfX6njazmXG/UMIl+swLWlegaRZw64tlTPIHKuLdUziFbX8bWcy42ohUoknQKMKT7OzG4Of6suM+xcq4r5sfW7wNEkxb0K+WwG3Fz/ZjmXHzFnom6SZYR9AMG5IjHXRI8Bo7JqiHN5FXMmeimwIiSdFs9sfVfdW+VcjsQE0VVZNcK5PIvJnbu/2vOSFprZG9I3ybl8qefvRF4ty7UlT/txLiVP+3EuJc/idi6lmoNI0kclHVJllwvq0B7ncifmTHQY8JCkeZImhzXl+pjZY/VtmnP5UHMQmdkVJIsp3gBcBDwh6cuSjs6obc7lQtQ1UcibWxtuu4BDgNskXZNB25zLhZhroo9JWgJcA/wGeLWZfYRk6ayzKxxTqcjXREm/k7RM0mJJJ4XtknRdKPL1iKTXpf6EzmUsJu3nEOAsM/tT8UYz65X0zgrHVCrydQ1wtZn9VNKU8PjNwDtIuozjgdcD3wx/nWtaNZ2JJA0Bzi0NoAIze7zC9jVm9nC4vxUoFPkyoFBi8iDgf8P9qcDNlvgdcLCkw2v9MM41Qk1nIjPbHWqojjazpwfyRiVFvi4HfibpqySBfErYrVKRrzUlr+X1iVzTiBlYOARYLuk+SfMLt1oOLC3yBXwE+LiZHQl8nGTEr2ZmNtvMus2se8SIETGHOld3MddEnx/IG5Qr8gVcSFIDFuAHJIWQocYiX841k5jfie4nWdGnM9x/CHi42jGVinyRXAO9KdyfBDwR7s8HPhhG6U4GtoTiyc41rZiFSj5Mch1yKMmCJUeQVLh7a5XDKhX5+jBwraT9gJ7wugB3AVOAJ4G/AhfX2j7nGiWmOzcdOIlkYAAze0LSyGoHmNmvqZyYemLphvBj7vQy+zrXtGIGFnaYWV+V73AW8TlEru3FBNH9kj4L7C/pdJIBgTuyaZZz+RETRJ8GNgCPAv9Acv1yRRaNci5PYhYq6QW+HW7OuSBmdG4VZa6BzGxcXVvkXM7ELiNc0AW8j2S427m2FvNj66ai27Nm9u/AGdk1zbl8iOnOFc/t6SA5M0WVZnGuFcUEwdfYc020iyQF6H31bpBzeRMTRHeSBFEhA8GAdxbWKynJjXOubcQE0YnA3wA/JgmkM4FF7Ekeda4txQTRy4HXhRmqSLoK+ImZnZ9Fw5zLi9h1514oevxC2OZcW4s5E90MLJL0o/D43cBN9W6Qc3kTk/bzr5J+CpwWNl1sZkuzaZZz+RH1O09YuafqbFbn2o2XVnEuJQ8i51LyIHIuJQ8i51LyIHIuJQ8i51LyIHIupUyDqEp9oltDbaJlklYXLeyIpM+E+kR/lPT2LNvnXD1kPamubH0iMzunsIOkrwFbwv3jgXOBCcDLgHslHWNmuzNup3MDlumZqEp9IqBvre5pwC1h01RgrpntMLNVJMsJn5RlG51La9CuiUrqExWcBqwzs8KcpEr1iUpf69JQpnLxhg0b9nmv3l5j5YZtLHxqIys3bKO31xdqddkZlDUSytQnKng/e85CNTOz2cBsgO7u7r0ipLfXuHv5WmbOW0bPzl66OjuYNW0ikyeMoqOj0rLgzg1c5meiCvWJCmt5nwXcWrR76vpEqzdt7wsggJ6dvcyct4zVm7YP8BM4V13Wo3OV6hMB/B3wBzN7pmjbfOBcSS+WNJakAPKimPdc93xPXwAV9OzsZf3Wnuj2O1eLrM9EhfpEk4qGtKeE586lpCtnZsuBecAK4G5geuzI3GHDuujq3PtjdXV2MHJo1wA/gnPVKSkJlF/d3d22ePHivsd+TeQyVPYL1HKLL3Z0
iMkTRnHcjNNYv7WHkUO7GDP8AA8gl5mWCyJIAmnciAMZN+LARjfFtQHPnXMuJQ8i51LyIHIupdyPzknaAPyp0e0IXgpsbHQjMtTun2+jmU0u3Zj7IGomkhabWXf/e+aTf77yvDvnXEoeRM6l5EFUX7Mb3YCM+ecrw6+JnEvJz0TOpeRB5FxKHkQDJOk7ktZLeqxo26GS7pH0RPh7SCPbmEaVlZpa4jNK6pK0SNLvw+e7OmwfK+nBsOLUrZJe1N9reRAN3E1A6Q9vnwbuM7PxwH3hcV4VVmo6HjgZmB5WY2qVz7gDmGRmrwEmApMlnQx8Bfi6mb0C+AtwSX8v5EE0QGb2K+C5ks1TgTnh/hySaoK5VGWlppb4jJbYFh52hpsBk4DbwvaaPp8HUX0dZmZrwv21tEhN25KVmlrmM0oaEhYOXQ/cAzwFbDazXWGXsqtNlfIgyoglvx3k/veDKis15f4zmtluM5tIsiDOScBxA3kdD6L6WifpcIDwd32D25NKhZWaWuozApjZZmAB8Abg4LASFdS42pQHUX3NBy4M9y8EftzAtqRSZaWmlviMkkZIOjjc3x84neS6bwHw3rBbTZ/PMxYGSNItwJtJ0ufXAVcC/0OyWtFokukZ08ysdPAhFySdCjwAPAoU1iD7LMl1Ue4/o6QTSAYOhpCcTOaZ2RcljQPmAocCS4HzzWxH1dfyIHIuHe/OOZeSB5FzKXkQOZeSB5FzKXkQOZeSB1GLkDSmOKO8n31/m3V72okHURsys1Ma3YZW4kHUgiSNk7RU0pvCnJllkh6RND48vy38/WJRyZtnJd0Ytp9fdNx/ShrSyM/T7DyIWoykY0ny3S4iSV+5NiRZdpNkJfcxsy+E595MMq3jekmvBM4B3hie2w18YHBan08tWRWijY0gyfU6y8xWSFoIfE7Sy4EfFhWY7hNy5P4bmGVmSyRdBpwIPJQ8xf60QJJplvxM1Fq2AE8DpwKY2feBdwH/B9wlaVKZY64CnjGzG8NjAXPMbGK4HWtmV2Xe8hzzIGotLwDvAT4o6byQTLnSzK4jOUOdULyzpDNJaufOKNp8H/BeSSPDPodKOmpQWp9T3p1rMWa2XdI7SWZq3gGcI2knySzUL5fsPpNk5uai0HWbb2ZfkHQF8HNJHcBOYDrNUzSg6XgWt3MpeXfOuZQ8iJxLyYPIuZQ8iJxLyYPIuZQ8iJxLyYPIuZT+H395FTjbDVZsAAAAAElFTkSuQmCC",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.relplot(data=p66_cd47_df, x=\"ksize\", y=\"query_n_unique_kmers\", height=3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 195,
+ "id": "f3906de8-ce4d-4a62-9359-49fa6670318e",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2024-11-12T19:00:35.042994Z",
+ "iopub.status.busy": "2024-11-12T19:00:35.042312Z",
+ "iopub.status.idle": "2024-11-12T19:00:35.195319Z",
+ "shell.execute_reply": "2024-11-12T19:00:35.194994Z",
+ "shell.execute_reply.started": "2024-11-12T19:00:35.042977Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 195,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAADRCAYAAAAKRcT2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAAsTAAALEwEAmpwYAAAdpUlEQVR4nO3deXRddbn/8fcnzXSaJmnSuaWFAi0UBNtSWqYrUBXrxKAgiAP89C4c4AJXXQuvuBTx5116Lw4o/hyuonAVFBAUEcHKIIMFOtLSltJSWjq3SZtmajM+vz/2N+U0TdLTnHNykuzntdZZ2WeP35P2OXvvb/bzfWRmOOcGv7xcN8A51zc82J2LCQ9252LCg925mPBgdy4mBk2wz5s3zwB/+StTr0Fn0AR7VVVVrpvgXL+W1WCXNE/SGknrJH25i+XvkLREUqukSzstu0rS2vC6KpvtdC4OshbskoYAPwbeC5wEfFTSSZ1WexO4Grin07aVwNeBOcBs4OuSKrLVVufiIJtn9tnAOjNbb2bNwO+Ai5JXMLMNZrYcaO+07XuA+Wa228z2APOBeb1pRFt7G23tbb3Z1LlBJT+L+54AbEp6v5noTN3bbSd0XknSNcA1AJMmTTpoWWt7CzvrtrB6x2LazZg2ZgZjSyeRP6TgCD6Cc4PHgO6gM7Ofm9ksM5s1atSog5btqt/G06//iR31m9nVsIVn1j/CzvrNOWqpc7mXzWDfAkxMen9UmJftbQFYX73qkHlrd63AE39cXGUz2BcCUyRNllQIXAE8nOK2jwMXSKoIHXMXhHkpKxxSeMi8/CGFSDqS3Tg3aGQt2M2sFbiOKEhXA/eZ2UpJt0q6EEDS6ZI2A5cBP5O0Mmy7G/gm0RfGQuDWMC9lx1ROI09vfTwhpow6JQOfzLmBSYPlsnbWrFm2aNGiA+/NjOrG7WzZ+wZmxoTyyYwoGXvQF4BzPRh0l4DZ7I3PKUmMLBnHyJJxuW6Kc/2Cn+aciwkPdudiwoPduZjwYHcuJjzYnYsJD3bnYsKD3bmY8GB3LiY82J2LCQ9252LCg925mPBgdy4mPNidiwkPdudiwoPduZjwYHcuJjzYnYsJD3bnYsKD3bmY8GB3LiZyXcW1SNLvw/IXJR0T5hdIukvSCkmrJf1HNtvpXBzkuorrp4E9ZnY88H3gO2H+ZUCRmZ0CnAZ8puOLwDnXOzmt4hre3xWmHwDeqahkiwElkvKBBNAM1Gaxrc4NetkM9lQqsR5YJ1SQ2QuMIAr8BmAbUQ332460Ioxz7mD9tYNuNtAGjAcmA1+UdGznlSRdI2mRpEW7du3q6zY6N6DkuorrgXXCJXs5UA1cCTxmZi1mthN4HpjV+QA9lWx2zh0s11VcHwauCtOXAk9aVHzuTWAugKQS4Azg1Sy21blBL6dVXIFfAiMkrQO+AHT8ee7HwLBQ1XUh8CszW56ttjoXB4O2iqtzaRp0VVz7awedcy7DPNidiwkPdudiwoPduZjwYHcuJjzYnYsJD3bnYsKD3bmY8GB3LgMkHSPplRTX/We229MVD3bn+piZnZWL43qwO5dhko6VtFTSuZJekrRM0nJJU8Ly+vDz1rBsmaQtkn4V5n88abufhVGf0ubB7lwGSToB+ANwNVEm5+1mNp0oRXtz8rpm9rWw7DxgN3CHpGnA5cDZYVkb8LFMtC0/EztxzgEwCvgT8CEzWyVpAXCzpKOAB81sbecNwjBsvwG+Z2aLJV1HNO7iwmgRCWBnJhrnZ3bnMmcv0VgM5wCY2T3AhcA+4FFJc7vY5hZgs5n9KrwXcJeZTQ+vE8zslkw0zoPducxpBi4BPinpyjCU2noz+yHRGf/U5JUlfRB4F3B90uwngEsljQ7rVEo6OhON88t45zLIzBokfQCYD/wZuFxSC7Ad+M9Oq3+BaNDVl8Il+8Nm9jVJXwX+JikPaAGuBTam2zYfvMK5rvngFc65gcmD3bmY8GB3LiY82J2L
iX5ZxTUsO1XSAkkrQzXX4my21bnBrl9WcQ3VYX4DfNbMTiZ6nLAlW211Lg76axXXC4DlZvYygJlVm1lbFtvqXL8jaUO4ql0mKe2/K2fzoZquqrjO6W4dM2uV1FHFdSpgkh4net74d2b2X1lsq3O9ds+S2/OAecBMYAnw2JUzb2jP0O7PN7OqTOyo22CX9COiOuldMrPru1uWAflEzxefDjQCT0habGZPdGrjNcA1AJMmTcpic5zrWgj0h4iege/w8D1Lbr8kgwGfET1dxi8CFgPFRN9Ya8NrOlCYwr7TqeK6GXjGzKrMrBF4NLThIF7F1fUD8zg40Anv52Vg30b02OzicGJLS7fBbmZ3mdldRA/vn2dmPzKzHwHvJAr4w0mniuvjwCmShoYvgXOBVUfwuZzrK4echIIZGdj3OWY2k6iT+1pJ70hnZ6l00FUAZUnvh4V5PUqniquZ7QG+R/SFsQxYYmZ/SekTOde3lnQzf2m6OzazLeHnTqJbhdnp7C+VDrpvA0slPUWUHPAOohzcwzKzR4kuwZPnfS1pej9wWTfb/oboz2/O9WePEV2hHnTPHub3mqQSIM/M6sL0BcCtae2zp6y3kGJ3BrCet3rSXzSz7ekcNBs8681lWMpZb0m98TOIzuhp98aHXPiHwtt84B4z+1Za+zxciqukpWaWifuPrPJgdxkWyxTXJyR9ODzs4pwboFIJ9s8A9wNNkmol1UmqzXK7nHMZdtgOOjMr7YuGOOeyK6XHZSVVAFOIHrABwMyeyVajnHOZd9hgl/SvwA1ET8AtI+qdXwB0NSyuc66fSuWe/QaiZ9Q3mtn5RH9eqMlmo5xzmZdKsO8PD78gqcjMXgVOyG6znHOS7pS0M7k6bBhHfr6kteHnYZ9m7ZDKPftmScOBPwLzJe0hA2NYOzdYXP/nQ1Ncf/jBjGS8/Rq4A7g7ad6XgSfM7Nth9KcvAzelsrMjGjde0rlEmWmPhQEp+g1/qMZlWErPlYRAPyTFFbgkEwEfhmp7xMzeFt6vIUpM2yZpHPC0maV0pX3Yy3hJZ0gqBTCzfwBPk5mMHucGg2ymuHZljJltC9PbgTGpbpjKPftPgPqk9/VhnnMuuymuPQrp4ClfmqcS7LKka30za8drxDnXIWsprt3YES7fCT9TLuecSrCvl3S9pILwuoEoC84591aKa7K0U1x7kDzgy1VE1WFTkkqwfxY4i2gIqY5BI9MeIse5wSB0wl0CvB/4aviZqc65e4keYDtB0mZJnyYaX+LdktYSlXv+dsr78yquznVp0GV5ptIbf1f4O3vH+wpJd2a1Vc65jEvlMv5UM6vpeBPGh/M/vTk3wKQS7HnJj+RJqsR7450bcFIJ2u8CCyTdT3QfcymQ1lhYzrm+l8rgFXdLWgycH2Z9yMx8DHfnBpiUCjua2UrgPqK/8dVLSqnWUjolm8PySZLqJX0pleM557qXSm/8heFvem8A/wA2AH9NYbtel2xO8r1UjuXcYNRNiustkraEyq7LJL0v1f2lcs/+TaLRaf5uZjMknQ98PIXtDpRsDo3sKNmcfAtwEW8VnHgAuEOSzMwkXUz0BdOQygdxLlfe/eMfHJLiOv/aG7OV4grwfTO77Uh3lsplfIuZVRP1yueZ2VPArBS266pk84Tu1gnlovYSlYMaRpSj+42eDiDpGkmLJC3atWtXCk1yLrNCoD8E/IXoxPgX4KEwPy1hnMfd6e6nQyoNqgnB9yzwW0m3k/2z7S1E3171Pa3kVVxdP9DXKa4A10laHi7zUx6pJpVgv5CoRvoNRA/3rwM+kMJ26ZRsngP8l6QNwI3AVyRdl8IxnetrfZ3i+hPgOKJKytuI/jSekm7v2SU9Z2bnADt4K2e243nh/ytpN/DfZvb/utnFgZLNREF9BXBlp3U6MngWcHDJ5n9JasctQL2Z3ZHqh3KuD/VpiquZ7eiYlvQ/wCOpbttTffZzws9SMysLr9LwKie6b7+hh+17XbLZuQGkT1Nc
O3LZg0uAV7pb95Bt08l6kzQuaYicnPKsN5dhKWe9JfXGH6jimone+JDieh4wkugK++vh/XSiq+0NwGdSjUFPcXWua/FLcXXODQ4e7M7FhAe7czHhwe5cTHiwOxcTHuzOxYQHu3P9kKSJkp6StErSylCvIetVXJ1zPZhy022HpLiu/c6X0n2ophX4opktCbUWF0uaD1xNL6u4+pnduTSEQD8kxTXM7zUz22ZmS8J0HdEj5xOIxoC4K6x2F3Bxqvv0YHc0tzbR2t6a62YMVFlPcQ3Dtc0AXiSNKq5+GR9jDc0NbNi9ltU7XqG0qIy3j5/FmNJxSIPuSdFs6inF9dF0dx7GkvgDcKOZ1Sb/24QRnTJaxdUNQmbGa7tW8s8Nz7Bn327erNnAo6sforqxKtdNG2iyluIqqYAo0H9rZg+G2Vmt4uoGocbmBpZvPfj/Y5u1sbvRh/c6QllJcVV0Cv8lsNrMvtdp31mr4uoGIeXlUTCk4JD5Q+R3dkci9LofUsU1A73xZwOfAOZ2GknWq7h6iuuRW1u1hqfXPX7g/dCCEj5w0ocoT6T8p9vBbNB1XPjXeIwdM3wy7512MVtq3qSkcBgTyid6oA9iHuwxVpBfyFHlkziqPKUCP26A83t252LCg925mPBgdy4mshrsva3iKundkhZLWhF+zs1mO52Lg6wFe5pVXKuAD5rZKUQPDvxvttrpXH/UQ4prVqu49lY6VVyTH+1aCSQkFZlZUxbb61yvHPuRbx2S4rr+vpuzleIKWazi2lu9ruLaaZ0PA0u6CnSv4upyLQT6ISmuYX6v9ZDi2mv9uoNO0slEl/af6Wq5V3F1/UBfp7hCFqu49lY6VVyRdBTRN+Ynzez1LLbTuXRktYpr5xRX0qjims1gP1DFVVIhURXXrrKDOjJ4DlRxlTSc6HLoy2b2fBbb6Fy6+jTF1cx2mFmbmbUD/0PUN5aSrAV7mlVcrwOOB76W1Os4OlttdS4NfZrimrMqrv2JZ725DEs56y2pN/5AFdd0e+MlnQM8C6wAOvb1FeCjeBVXD3aXUYMuxbVf98Y75zLHg925mPBgdy4mPNidiwkPdudiwoPduZjwYHeuH5JULOklSS+HFNdvhPmTw9gP68JYEIWp7tMHnHQuTafM/OIhKa4rlnw33RTXJmCumdWHx2afk/RXoidNv29mv5P0U6IxIX6Syg79zO5cGkKgH5LiGub3mkXqw9uC8DJgLtHYD+BVXJ3rU1lLcZU0RNIyonpu84HXgZqQdwJdjxHRLQ9212/UNdWzZe8WqhqqaGtvy3VzUpW1FNeQ3TadKD18NnBiOvvze3bXL2yv286fV/2FhuYG8pTHWUefyanjTqEwP+X+p1zJWoprBzOrkfQUcCYwXFJ+OLt3NUZEt/zM7nJuf8t+nlj7FA3NDQC0WzvPbXieqoYBUT46Wymuo8K4DkhKAO8mShV/imjsB/Aqrm6g2deyj10Nh44hWNtUl4PWHJnQ635IFdcM9MaPA56StJxoIJj5ZvYIcBPwhTAGxAiinPeU+GW8y7nigmIqE5Xs3rf7oPmlRcNy1KIjEwL70fDKCDNbThf3/WG05pRHp0nmZ3aXc4mCBO+aOpfi/OID8+ZMms2oEh9ENJP8zO76hfFl4/nojCuo3b+XovxiKhMV5A/x/56Z5L9N12+UF5dRXlyW62YMWn4Z71xMeLA7FxNZvYyXNA+4HRgC/MLMvt1peRFwN3AaUXGIy81sQ1j2H0QP+bcB15vZ49lsa1xt27aHNzdVUVJSRGlpgq3baygvTzA0UcjWXXupHD6MwoIhbK+qY0TFMJQndu2pZ3RlKW3WTvXeRsaOKGV/Sys1DfsZP6KMvfuaaGxqZnxFGdX1DbS0G2OHl7Kzto485TG6rISte2spys9nxLAE2/bWMbSwgLJEEdv31lGWKGZoYQE76uqoSAylMD+PXfX1VA4dypC8PKobGxgxtIQ22qndv48RJSW0trVR37yfESWl7Gtp
oqm1hRFDS6lrbqStvZ3y4gTVDXuRxMih5dS3NDIkbwijSypJFBTl+p+hT2Qt2JOquL6b6BnehZIeNrPkwo4HqrhKuoKo1NPlodrrFcDJwHjg75KmmtmAeYZyIFi1ahOfv/4XVO+O8i3eO28GTW1tPPnsKi67ZA7rt1SzeMVGrr7iHJ5d8jqvb6ricx87j/ufXEpVTQM3fGwuP/3zC7S0tXHj5e/g2w/8g2HFhVx70dncev+TjK8s5RNzZ/Kff/oHU8eO4AOnn8jtf1vAjKPHcvpxR/Gr5xZxztSjOaqyjAeXvMK8t01FecaTr67jQzNPYVttDUs3beHK2TNZtnkTb1RXcdWcOcx/bRV79jVy1Zwz+OOKxbS0tfHx08/kvpcXUJifz2Wnns69y56lIjGMeSe+nT+88hzjSis4Y9IJPL72BY4ePoYTR03guY1LmTbqWC6cdi7DE6U5/tfIvmxexh+o4mpmzUBHFddkFxFl7kCUyfPOMDj+RcDvzKzJzN4A1tHLvy26rjU07ue7tz9yINAB/vrYUk4+YQJmxn0PvsCZM4+jrb2dO+99lgvOPomW1jZ+es8/uOjcU9nf3MrPH3iWi//lZOoam7jr0UV8cM40qmobeXjBKs5/22Q2V9eyeO0W3j5pDK9tr2bb7jqOqihj6cbtYFBaXMRzr21k5LBhFAzJ47FXXmPq6NEY8IclK5g5aSLtZvzmxcWcc9xxtLa3c/dLLzF36ok0tbZyz6KFnHf8NBpbmvnTiqWcefQUavfv4+nX1zB93GSqG+tYuX0LkyvGsq1uD7vqaxmRKGdjzQ5a2oxEQRGrd61nw56tufuH6EP9tYprKtt6Fdc01NbuY9nLGw6Zv39f8yHTZkZzU5Ro1dzSSnt7VGtgT90+EoUFALy5Yw9jyqOHYF55YxtTxo0EYMXG7QemV27aybGjozqEa3fsZmJlOQDba+oYPjQBQENzC3mKhmzf39yR3AXNrdFFXUtbGx21DuqbmigMf57bUVfL8EQJAG/s3sn48koANuzZybiyaHpjzS7GDIuOv61uN5WJ6Phb6nYewW9u4BrQHXRexbX3hpeXMGf2lEPmFycKD5nOyxMFRVFQJYoK6CgrMnJ4CfX7oy+E4yaMZMvuvQDMnHIUKzftAOC04yccmJ5+zFhe214NwAnjRrKxag8A4yvK2N3QCEBJYQHtIZiLC6Nj5kkU5A8BoCg//8DxhycS7G+Jjj+hvILqhujx2qmjxvHmnujLf8qIcWyqiaaPrRzDtvrqsP5IqhprAJhYPvYIfnMDV3+t4prKti4NiUQhN1z3PiZOjM66eXniI5eexZIVG8jPH8KnPnEeT73wKkWF+Vx79Tt55OnllAwt4rpPzuWhJ5cxvDTBNZeew4PPrGB0xTCufM9MHl24homjhnPBrCk89+pGTjxqJNMmjWL11ipmHTuBspJitu+t5/xpk9nX3ExDcwvvf/sJbNqzBzP4yOmnsmLrVvLz8rjqzNNYsP4NivKH8Omz5/DkmjUkCgr41Jln8vc1qygtKuLKWbN5ct1qKhJD+cDJp/LCm2sZPayMs44+nhXbNzKxfATHjRzNpr27OK5yHOXFCfbsq2PaqKNpb2+hqbWZ0yecxDEV43P8r9E3slb+KQTva8A7iQJ1IXClma1MWuda4BQz+2zooPuQmX0k1GW/h+g+fTzwBDClpw46L//UO1XVdWzeXB31xpcl2La9hrLSBImhRWzfuZeK8qEUFOSzc3ctI4YPQ3mwq6aB0RWlNLe1UVO3jzEjSmloaqF+XxPjKsvY27iffS2tjKsopaqugbZ2Y+zwYeyorSdPYnRZCdv21lGUn09lSYJttXUMLSigPFHE9to6hhUXU1JQwI76OoYnEhTmD6Gqvp6KkhLyBLsbG6PeeGujdv9+RpaU0NTaSmNLEyOGltLQsp+WtlYqE8Oob9lHW3s7ZYVDqd5XgyRGDC2nvrmR/LwhjBw6nML8gq5+NYOu/FNWa71J
eh/wA6I/vd1pZt+SdCuwyMwellQM/C/RA/+7gSvCg/5Iuhn4FNBKVJv6rz0dy4PdZZgHe3/lwe4ybNAF+4DuoHPOpc6D3bmYGDSX8ZJ2ARtz3Y5ujAQGxBhLaRpMn7PKzNIeIbY/GTTB3p9JWmRms3LdjmyLy+ccqPwy3rmY8GB3LiY82PvGz3PdgD4Sl885IPk9u3Mx4Wd252LCg925mPBgT5OkiZKekrRK0kpJN4T50yW9IGlZyLmfHeZL0g8lrZO0XFJ3hQH7HUnFkl6S9HL4rN8I8ydLejF8pt9LKgzzi8L7dWH5MTn9AHFnZv5K40VUpmdmmC4lyvQ7Cfgb8N4w/33A00nTfyV69voM4MVcf4Yj+KwChoXpAuDF8BnuI0piAvgp8Lkw/Xngp2H6CuD3uf4McX75mT1NZrbNzJaE6Tqi4nsTAAM6BkEvBzrGProIuNsiLxBV5RzXx83uldDmjnGsCsLLgLlEw4pBNMzYxWG6u2HHXA54kYgMCpepM4jOeDcCj0u6jeh26aywWndDbm3rs4amIQwkuhg4nmhA0deBGouGFYODhxA7aNgxSR3Djg2WR2oHFD+zZ4ikYcAfiHLva4HPAf9uZhOBf+cIqm32Z2bWZmbTiUYPmg2cmNsWuVR5sGeApAKiQP+tmT0YZl8FdEzfz1uj4w6KIbfMrIaoVviZRLciHVeJyZ+nu2HHXA54sKcp3IP+ElhtZt9LWrQVODdMzwXWhumHgU+GXvkzgL1mNlAu4UdJGh6mE0Q1AVYTBf2lYbWrgD+F6YfDe8LyJy301rm+50/QpUnSOcCzwAqgPcz+ClBLVA0nH9gPfN7MFocvhzuAeUAj8H/MbEAMsSPpVKIOtyFEJ4r7zOxWSccS1QWoBJYCHzezpp6GHXN9z4PduZjwy3jnYsKD3bmY8GB3LiY82J2LCQ9252LCg925mPBgjyFJ/+yj41wt6Y6+OJY7PA/2LEt6jLTfMLOzDr/WwBASc1wKPNg7kXSzpNckPSfpXklfkvS0pFlh+UhJG8L0EEn/LWlhGIjiM2H+eZKelfQwsErSrZJuTDrGtzoGueji+OeF4z0g6VVJv+0pLVTSBkkjw/QsSU+H6Vsk3Rn2tV7S9Unb1IefknSHpDWS/i7pUUmXHma/JWG/L0laKumiFH+v75e0IPz+fi3pJ2Fwj/XhM98pabWkXydtc0HYZomk+0OyUUfbviNpCXCZpOsVDR6yXNLvUmlPHPW7s04uSTqNaJCF6US/myVE6Zzd+TTRs+2nSyoCnpf0t7BsJvA2M3sjpL4+CPxAUl44xuwu9xiZAZxM9Hz988DZwHO9+EgnAucTDaqxRtJPzKwlafklwAlEg22MAVYBdx5mnzcTPeP+qfCc/EuS/m5mDd1tIOkS4AvA+8xsT/juqiBKormQ6Bn6s4F/BRZKmk6UKvtV4F1m1iDpprCPW8Nuq81sZtj/VmByeER3+OF/LfHkwX6wfwEeMrNGgHBm7skFwKkdZ0OirK4pQDPwkpm9AWBmGyRVS5pBFFRLzayn7K+XzGxzaMMy4Bh6F+x/MbMmoEnSznDszUnL3wHca1Hd+62SnkxhnxcAF0r6UnhfDEwiSojpylxgFnBBSP3t8GczM0krgB1mtgJA0kqiz3sU0ZfQ8+HLoRBYkLT975OmlwO/lfRH4I8pfIZY8mBPTStv3fIUJ80X8G9m9njyypLOAzqf6X4BXA2M5fBnz6ak6TZ6/nfqrm1Hup9U9yvgw2a2JsX9vA4cC0wFkhN+OtrW3qmd7aGdbcB8M/toN/tN/v2+n+iL64PAzZJOSRpMwwV+z36wZ4CLJSUklRL95wHYAJwWpi9NWv9x4HOK8tmRNFVSSTf7fogo0+30sF2mJLftw0e47TPA5aHvYRzRJf/h9vs48G8d/QjhaqUnG8P2d0s6+Qja9gJwtqTjw3FKJE3tvFK4LZpoZk8BNxFdXQ07guPEhgd7kjCW3O+Bl4kGhVwYFt1GFNRL
iSqVdvgF0X3uEkmvAD+jm7OnmTUT5X3fFy6bM+UbwO2SFhGdDY/EQ0R59quAuzn4Mrm7/X6TaOy55eGS+5uHO4iZvQp8DLhf0nGpNMzMdhFdCd0raXloW1ej4gwBfhNuB5YCPwwDa7hOPMW1B5JuAerN7LYM7CuPqMPvMjNbe7j1cyH0hD9iZg8cbl038PiZvQ9IOglYBzzRXwPdDX5+Zs8RSacQjeKSrMnM5nSz/kPA5E6zb+rcOZgLkt4DfKfT7DfM7JJctMd1zYPduZjwy3jnYsKD3bmY8GB3LiY82J2Lif8PXJB3bE1Kqo8AAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.relplot(\n",
+ " data=p66_cd47_df,\n",
+ " x=\"query_n_unique_kmers\",\n",
+ " y=\"jaccard\",\n",
+ " height=3,\n",
+ " hue=\"ksize\",\n",
+ " palette=\"crest\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e94664e6-fe0c-45ec-9fd5-9b3cac851fa9",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1498b57-8a4f-4d45-81be-61ed007cdb53",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21b8532b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}