Commit: Small fixes to sample QC notebook (#773)
epiercehoffman authored Jan 31, 2025
1 parent 7d4a014 commit d9a905e
Showing 1 changed file with 99 additions and 66 deletions.
165 changes: 99 additions & 66 deletions scripts/notebooks/SampleQC.ipynb
@@ -26,14 +26,15 @@
"**Next Steps**: Batching, TrainGCNV.\n",
"\n",
"**Legend**:\n",
"<div class=\"alert alert-block alert-info\"> <b>Blue Boxes for One-Time Runs</b>: Uncomment and run the code cell directly below just once, then re-comment the cell to skip it next time. These cells typically save intermediate outputs locally so that the notebook can be paused and picked back up without a significant delay.</div>\n",
"<div class=\"alert alert-block alert-info\"> <b>Blue Boxes for One-Time Runs</b>: Run the code cell directly below just once. To skip it next time, you can select the cell's contents and comment them out (see below for keyboard shortcuts). These cells typically save intermediate outputs locally so that the notebook can be paused and picked back up without a significant delay.</div>\n",
"<div class=\"alert alert-block alert-success\"> <b>Green Boxes for User Inputs</b>: Edit the inputs provided in the code cell directly below. The inputs that are editable are defined in all capitals, and their descriptions can be found in the section headers. </div>\n",
"\n",
"**Execution Tips**:\n",
"* The first time you start this notebook (one time only), you will need to uncomment and run the package installation cell under *Imports*.\n",
"* The first time you start this notebook (one time only), you will need to run the package installation cell under *Imports*. Then, we recommend commenting out the package installation to skip it next time.\n",
"* Once the packages are installed, to quickly run all the cells containing helper functions, constants, and imports, skip to the first cell of *Data Ingestion*, click \"Cell\" in the toolbar at the top of the notebook, and select \"Run All Above.\" Then, starting from *Data Ingestion*, proceed step-by-step through the notebook.\n",
"* The keyboard shortcut to run a cell is `Shift`+`Return`.\n",
"* The keyboard shortcut to comment or uncomment an enitre cell is `Command`+`/` or `Control`+`/`."
"* To select all the lines of code in a cell, click on the cell and then click `Command`+`A` or `Control`+`A`.\n",
"* The keyboard shortcut to comment or uncomment all selected lines is `Command`+`/` or `Control`+`/`."
]
},
{
@@ -52,7 +53,7 @@
"hidden": true
},
"source": [
"<div class=\"alert alert-block alert-info\">Uncomment and run once. It is not necessary to reinstall these packages each time you restart your cloud environment.</div> "
"<div class=\"alert alert-block alert-info\">Run once. It is not necessary to reinstall these packages each time you restart your cloud environment.</div> "
]
},
{
@@ -67,7 +68,7 @@
},
"outputs": [],
"source": [
"# ! pip install upsetplot"
"! pip install upsetplot"
]
},
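After the first run of this cell, the install can be toggled back into a comment so later notebook restarts skip it — a minimal sketch of the two states, per the workflow described in the legend above (illustrative, not part of the commit):

```python
# First run: install the one extra plotting dependency the notebook needs
! pip install upsetplot

# Later runs: select the cell contents (Command/Control+A), then toggle the
# comment (Command/Control+/) so the install is skipped
# ! pip install upsetplot
```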
{
@@ -171,7 +172,9 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"heading_collapsed": true
},
"source": [
"# Helper Functions\n",
"This section instantiates helper functions used throughout the notebook."
@@ -180,7 +183,8 @@
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
"heading_collapsed": true,
"hidden": true
},
"source": [
"## File System Functions"
@@ -310,7 +314,8 @@
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
"heading_collapsed": true,
"hidden": true
},
"source": [
"## Processing Functions"
@@ -437,7 +442,8 @@
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
"heading_collapsed": true,
"hidden": true
},
"source": [
"## Validation Functions"
@@ -499,6 +505,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"code_folding": [],
"hidden": true
},
"outputs": [],
@@ -735,15 +742,19 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"heading_collapsed": true,
"hidden": true
},
"source": [
"## QC Functions"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
"heading_collapsed": true,
"hidden": true
},
"source": [
"### General Functions"
@@ -794,14 +805,15 @@
},
"outputs": [],
"source": [
"def plot_histogram(filter_type_data, filter_type, filter_name, cutoffs=None, line_styles=None, log_scale=False, **kwargs):\n",
"def plot_histogram(filter_type_data, filter_type, filter_name, filtered=False, cutoffs=None, line_styles=None, log_scale=False, **kwargs):\n",
" \"\"\"\n",
" Plots a histogram for a given data array and saves the plot as an image.\n",
"\n",
" Args:\n",
" filter_type_data (np.array): The data to plot the histogram for.\n",
" filter_type (str): The type of data being plotted (e.g., \"age\", \"height\", etc).\n",
" filter_name (str): The name of the metric being plotted.\n",
" filtered (bool): Include \"(Filtered)\" in plot title.\n",
" cutoffs (list): List of cutoff values for which to plot vertical lines.\n",
" line_styles (list): List of line styles corresponding to each cutoff.\n",
" log_scale (bool): Defines whether to log-scale the plot.\n",
@@ -843,7 +855,10 @@
" plt.gca().yaxis.grid(True, zorder=0, color='lightgrey')\n",
"\n",
" # Set the title and axes\n",
" plt.title(f\"{filter_name} - {len(filter_type_data)} Samples (Filtered)\", fontsize=16)\n",
" to_append = \"\"\n",
" if filtered:\n",
" to_append = \" (Filtered)\"\n",
" plt.title(f\"{filter_name} - {len(filter_type_data)} Samples{to_append}\", fontsize=16)\n",
" plt.xlabel(f\"{filter_name}\", fontsize=12)\n",
" plt.ylabel(\"Sample Count (Log Scale)\" if log_scale else \"Sample Count\", fontsize=12)\n",
"\n",
@@ -998,7 +1013,7 @@
" filter_pass_data = filter_pass[filter_type]\n",
" if (filter_type == 'median_coverage'):\n",
" filter_pass_data = filter_pass_data * (read_length / 100)\n",
" file_png = plot_histogram(filter_pass_data, filter_type, filter_name, log_scale=log_scale, **kwargs)\n",
" file_png = plot_histogram(filter_pass_data, filter_type, filter_name, filtered=True, log_scale=log_scale, **kwargs)\n",
" \n",
" # Display plot\n",
" if display:\n",
@@ -1077,7 +1092,8 @@
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
"heading_collapsed": true,
"hidden": true
},
"source": [
"### Autosomal Copy Number Functions"
@@ -1284,7 +1300,8 @@
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
"heading_collapsed": true,
"hidden": true
},
"source": [
"### Sex Analysis Functions"
@@ -1448,6 +1465,9 @@
" differences = reference_ped[reference_ped['Sex'] != reference_ped['Computed_Sex']]\n",
" differences = differences[['Family_ID', 'Sample_ID', 'Paternal_ID', 'Maternal_ID', 'Sex', 'Phenotype']]\n",
" \n",
" # Print number of differences\n",
" print(f\"Found {len(differences)} differences between the computed sex and the sex in the reference PED file.\")\n",
" \n",
" # Save differences file\n",
" differences = differences.reset_index(drop=True)\n",
" save_df(WS_BUCKET, file_path, differences)\n",
@@ -1562,7 +1582,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-block alert-info\">Uncomment and run once. Once this step has run, if you need to load the table again, use the following cell.</div>"
"<div class=\"alert alert-block alert-info\">Run once. Once this step has run, if you need to load the table again, use the following cell.</div>"
]
},
{
@@ -1573,34 +1593,35 @@
},
"outputs": [],
"source": [
"# sample_set_response = fapi.get_entities_tsv(\n",
"# NAMESPACE, WORKSPACE, \"sample_set\", \n",
"# attrs=[\"ploidy_plots\", \"qc_table\"], model=\"flexible\"\n",
"# )\n",
"sample_set_response = fapi.get_entities_tsv(\n",
" NAMESPACE, WORKSPACE, \"sample_set\", \n",
" attrs=[\"ploidy_plots\", \"qc_table\"], model=\"flexible\"\n",
")\n",
"\n",
"# with open('sample_set.zip', 'wb') as f:\n",
"# f.write(sample_set_response.content)\n",
" \n",
"# with zipfile.ZipFile('sample_set.zip', 'r') as zip_ref:\n",
"# # Extract sample set data\n",
"# with zip_ref.open('sample_set_entity.tsv') as file:\n",
"# tsv_file = io.StringIO(file.read().decode('utf-8'))\n",
"# sample_set_tbl = pd.read_csv(tsv_file, sep='\\t')\n",
"# sample_set_tbl = sample_set_tbl[sample_set_tbl['ploidy_plots'].notnull() & sample_set_tbl['qc_table'].notnull()]\n",
"# sample_set_tbl = sample_set_tbl.reset_index(drop=True)\n",
"with open('sample_set.zip', 'wb') as f:\n",
" f.write(sample_set_response.content)\n",
"\n",
"with zipfile.ZipFile('sample_set.zip', 'r') as zip_ref:\n",
" # Extract sample set data\n",
" with zip_ref.open('sample_set_entity.tsv') as file:\n",
" tsv_file = io.StringIO(file.read().decode('utf-8'))\n",
" sample_set_tbl = pd.read_csv(tsv_file, sep='\\t')\n",
" sample_set_tbl = sample_set_tbl[sample_set_tbl['ploidy_plots'].notnull() & sample_set_tbl['qc_table'].notnull()]\n",
" sample_set_tbl = sample_set_tbl.reset_index(drop=True)\n",
" \n",
"# # Extract sample membership data\n",
"# with zip_ref.open('sample_set_membership.tsv') as membership_file:\n",
"# membership_tsv = io.StringIO(membership_file.read().decode('utf-8'))\n",
"# membership_df = pd.read_csv(membership_tsv, sep='\\t')\n",
" # Extract sample membership data\n",
" with zip_ref.open('sample_set_membership.tsv') as membership_file:\n",
" membership_tsv = io.StringIO(membership_file.read().decode('utf-8'))\n",
" membership_df = pd.read_csv(membership_tsv, sep='\\t')\n",
" \n",
"# # Add list of samples to corresponding sample set\n",
"# sample_groups = membership_df.groupby('membership:sample_set_id')['sample'].unique().apply(list)\n",
"# sample_set_tbl['samples'] = sample_set_tbl['entity:sample_set_id'].map(sample_groups)\n",
"# sample_set_tbl['samples'] = sample_set_tbl['samples'].apply(lambda x: x if isinstance(x, list) else [])\n",
" # Add list of samples to corresponding sample set\n",
" sample_groups = membership_df.groupby('membership:sample_set_id')['sample'].unique().apply(list)\n",
" sample_set_tbl['samples'] = sample_set_tbl['entity:sample_set_id'].map(sample_groups)\n",
" sample_set_tbl['samples'] = sample_set_tbl['samples'].apply(lambda x: x if isinstance(x, list) else [])\n",
"\n",
"# file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')\n",
"# save_df(WS_BUCKET, file_path, sample_set_tbl)"
"file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')\n",
"save_df(WS_BUCKET, file_path, sample_set_tbl)\n",
"sample_set_tbl"
]
},
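The blue box above mentions a reload cell for later sessions; that cell sits outside this hunk, but a minimal sketch consistent with the helpers used here (assumes save_df writes a TSV under WS_BUCKET, and that pandas can read the resulting gs:// path, e.g. via gcsfs):

```python
import os
import pandas as pd

# Hypothetical reload of the saved sample-set table in a fresh session
file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')  # notebook helper
sample_set_tbl = pd.read_csv(os.path.join(WS_BUCKET, file_path), sep='\t')
```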
{
@@ -1621,7 +1642,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-block alert-success\"><b>Optional</b>: If you wish to only include a subset of batches out of the ones listed above, you can filter <tt>sample_set_tbl</tt> to only include them using the cell below. Ensure that all batches you expect are included.</div>"
"<div class=\"alert alert-block alert-success\"><b>Optional</b>: If you wish to only include a subset of batches out of the ones listed above, you can filter <tt>sample_set_tbl</tt> to only include them using the cell below. Ensure that all batches you expect are included by running the following cell to display them.</div>"
]
},
{
@@ -1630,8 +1651,14 @@
"metadata": {},
"outputs": [],
"source": [
"# Optionally filter to only include a subset of batches from sample_set_tbl and update the saved file. For example:\n",
"sample_set_tbl = sample_set_tbl[sample_set_tbl['entity:sample_set_id'].str.contains('KJ_EvidenceQC_Updates')]"
"# Optionally filter to only include a subset of batches from sample_set_tbl and update the saved file\n",
"# For example, to select batches based on a shared sample_set_id substring, set SUBSTRING below\n",
"SUBSTRING = None\n",
"if SUBSTRING is not None:\n",
" sample_set_tbl = sample_set_tbl[sample_set_tbl['entity:sample_set_id'].str.contains(SUBSTRING)]\n",
" \n",
" file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')\n",
" save_df(WS_BUCKET, file_path, sample_set_tbl)"
]
},
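A hedged example of the new opt-in filter — the substring below is illustrative (the removed line above happened to hard-code 'KJ_EvidenceQC_Updates'):

```python
# Illustrative only: keep batches whose entity:sample_set_id contains this substring
SUBSTRING = "EvidenceQC"
```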
{
@@ -1640,9 +1667,6 @@
"metadata": {},
"outputs": [],
"source": [
"file_path = generate_file_path(TLD_PATH, 'artifacts', 'sample_sets_qc.tsv')\n",
"save_df(WS_BUCKET, file_path, sample_set_tbl)\n",
"\n",
"# Output batch information\n",
"print(f\"Sample Set DataFrame Dimensions: {sample_set_tbl.shape}\")\n",
"print(f\"Batch Count: {len(sample_set_tbl)}\\n\")\n",
@@ -1664,7 +1688,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-block alert-info\">Uncomment and run once. Once this step has run, if you need to load the table again, use the next cell.</div>"
"<div class=\"alert alert-block alert-info\">Run once. Once this step has run, if you need to load the table again, use the next cell.</div>"
]
},
{
@@ -1673,15 +1697,15 @@
"metadata": {},
"outputs": [],
"source": [
"# samples_qc_table = pd.concat([pd.read_csv(f, sep='\\t') for f in sample_set_tbl['qc_table']], ignore_index = True)\n",
"samples_qc_table = pd.concat([pd.read_csv(f, sep='\\t') for f in sample_set_tbl['qc_table']], ignore_index = True)\n",
"\n",
"# print(f\"Sample DataFrame Dimensions: {samples_qc_table.shape}\")\n",
"# print(f\"Sample Count: {len(samples_qc_table)}\\n\")\n",
"print(f\"Sample DataFrame Dimensions: {samples_qc_table.shape}\")\n",
"print(f\"Sample Count: {len(samples_qc_table)}\\n\")\n",
"\n",
"# validate_unique_samples(samples_qc_table)\n",
"validate_unique_samples(samples_qc_table)\n",
"\n",
"# file_path = generate_file_path(TLD_PATH, 'artifacts', 'samples_qc.tsv')\n",
"# save_df(WS_BUCKET, file_path, samples_qc_table)"
"file_path = generate_file_path(TLD_PATH, 'artifacts', 'samples_qc.tsv')\n",
"save_df(WS_BUCKET, file_path, samples_qc_table)"
]
},
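validate_unique_samples is defined earlier in the notebook and is not shown in this diff; a hedged sketch of the check it plausibly performs (the sample-ID column name is an assumption):

```python
# Hypothetical duplicate check across the concatenated per-batch QC tables
dupes = samples_qc_table.loc[samples_qc_table.duplicated(subset=['sample_id']), 'sample_id']
if not dupes.empty:
    raise ValueError(f"Duplicate sample IDs across batches: {sorted(set(dupes))}")
```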
{
@@ -1710,7 +1734,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-block alert-info\">Uncomment and run once. Once this step has run, if you need to load the table again, use the next cell.</div>"
"<div class=\"alert alert-block alert-info\">Run once. Once this step has run, if you need to load the table again, use the next cell.</div>"
]
},
{
@@ -1721,17 +1745,17 @@
},
"outputs": [],
"source": [
"# dir_path = os.path.join(TLD_PATH, \"ploidy\")\n",
"# ploidy_dirs = process_ploidy_data(sample_set_tbl, 'ploidy_plots', dir_path)\n",
"dir_path = os.path.join(TLD_PATH, \"ploidy\")\n",
"ploidy_dirs = process_ploidy_data(sample_set_tbl, 'ploidy_plots', dir_path)\n",
"\n",
"# # Write the directory names to a file\n",
"# file_path = os.path.join(TLD_PATH, \"ploidy\", \"ploidy_dirs.list\")\n",
"# with open(file_path, 'w') as dirs_file:\n",
"# for ploidy_dir in ploidy_dirs:\n",
"# dirs_file.write(ploidy_dir + '\\n')\n",
"# Write the directory names to a file\n",
"file_path = os.path.join(TLD_PATH, \"ploidy\", \"ploidy_dirs.list\")\n",
"with open(file_path, 'w') as dirs_file:\n",
" for ploidy_dir in ploidy_dirs:\n",
" dirs_file.write(ploidy_dir + '\\n')\n",
"\n",
"# # Get binwise copy number files\n",
"# binwise_cn_files = [os.path.join(ploidy_dir, \"ploidy_est\", \"binwise_estimated_copy_numbers.bed.gz\") for ploidy_dir in ploidy_dirs]"
"# Get binwise copy number files\n",
"binwise_cn_files = [os.path.join(ploidy_dir, \"ploidy_est\", \"binwise_estimated_copy_numbers.bed.gz\") for ploidy_dir in ploidy_dirs]"
]
},
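With this cell now active, binwise_cn_files points at one BED-style table per batch; a hypothetical peek at one of them (column layout assumed; gzip is inferred from the .gz extension, and reading gs:// paths assumes the cloud environment's filesystem support):

```python
import pandas as pd

# Hypothetical inspection of one binwise estimated-copy-number table
cn = pd.read_csv(binwise_cn_files[0], sep='\t')
cn.head()
```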
{
@@ -1856,9 +1880,10 @@
"metadata": {},
"outputs": [],
"source": [
"# Visualize the median coverage data before filtering\n",
"# Example use of **kwargs to pass in parameter `bins` to the histogram plotting function\n",
"run_analysis(samples_qc_table, 'median_coverage', LINE_DEVIATIONS, LINE_STYLES, READ_LENGTH, \n",
" log_scale=LOG_SCALE, bins=50)"
" log_scale=LOG_SCALE, bins=30)"
]
},
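The comment added above points out the **kwargs relay; a minimal self-contained sketch of that pattern (function names echo the notebook's, bodies are illustrative):

```python
import matplotlib.pyplot as plt
import numpy as np

def plot_histogram_sketch(data, **kwargs):
    plt.hist(data, **kwargs)               # e.g. bins=30 lands here

def run_analysis_sketch(data, **kwargs):
    plot_histogram_sketch(data, **kwargs)  # extra keywords pass straight through

run_analysis_sketch(np.random.lognormal(mean=3.4, sigma=0.15, size=500), bins=30)
plt.show()
```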
{
@@ -1900,9 +1925,10 @@
},
"outputs": [],
"source": [
"# Perform filtering with the defined cutoffs and plot the median coverage for the passing samples\n",
"# Example use of **kwargs to pass in parameter `bins` to the histogram plotting function\n",
"run_filtering(samples_qc_table, 'median_coverage', METHOD, lower_cutoff=LOWER_CUTOFF, upper_cutoff=UPPER_CUTOFF, \n",
" mad_cutoff=MAD_CUTOFF, read_length=READ_LENGTH, log_scale=LOG_SCALE, bins=50)"
" mad_cutoff=MAD_CUTOFF, read_length=READ_LENGTH, log_scale=LOG_SCALE, bins=30)"
]
},
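METHOD, LOWER_CUTOFF, UPPER_CUTOFF, and MAD_CUTOFF are set in cells outside this hunk; one plausible reading of a MAD-based cutoff, offered as an assumption rather than the notebook's actual logic:

```python
import numpy as np

# Toy per-sample median coverage with one outlier
cov = np.array([29.5, 30.5, 31.2, 29.8, 55.0])
med = np.median(cov)
mad = np.median(np.abs(cov - med))
keep = np.abs(cov - med) <= 3 * mad   # MAD_CUTOFF = 3, illustrative
print(cov[keep])                      # only the outlier 55.0 is dropped
```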
{
@@ -2790,6 +2816,13 @@
"tsv = pd.read_csv(file_path, sep='\\t')\n",
"tsv"
]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
}
],
"metadata": {
