Generate cylinter_report.yml file. (#179)

labsyspharm · Mar 14, 2024 · 30ab9a7 · 30ab9a7
1 parent d377773
commit 30ab9a7
Show file tree

Hide file tree

Showing 23 changed files with 3,991 additions and 2,895 deletions.
diff --git a/README.md b/README.md
@@ -3,10 +3,10 @@
 
 ## An Interactive Image Segmentation Filter for Multiplex Microscopy.
 
-CyLinter is quality control software for identifying and removing cell segmentation instances corrupted by optical and/or image-processing artifacts in multiplex microscopy images. The tool is user-guided and comprises a set of modular and extensible QC modules instantiated in a configurable [Python](https://www.python.org) Class object. Module results are cached to allow for dynamic restarts.
+CyLinter is quality control software for identifying and removing cell segmentation instances corrupted by optical and/or image-processing artifacts in multiplex microscopy images. The tool is interactive and comprises a set of modular and extensible QC modules instantiated in a configurable [Python](https://www.python.org) Class object. Module results are cached to allow for progress bookmarking and dynamic restarts.
 
 CyLinter development is led by [Greg Baker](https://github.com/gjbaker) at the [Laboratory of Systems Pharmacology](https://labsyspharm.org/), Harvard Medical School.
 
-**Funding:** This work was supported by Ludwig Cancer Research and the Ludwig Center at Harvard and by NIH NCI grants U2C-CA233280 (Omic and Multidimensional Spatial Atlas of Metastatic Breast and Prostate Cancers) and U2C-CA233262 (Pre-cancer atlases of cutaneous and hematologic origin—PATCH Center) to Peter K. Sorger and Sandro Santagata as part of the Human Tumor Atlas Network. Development of computational methods and image processing software is supported by a Team Science Grant from the Gray Foundation, the Gates Foundation grant INV-027106, the David Liposarcoma Research Initiative, and the Emerson Collective.
+**Funding:** This work was supported by Ludwig Cancer Research and the Ludwig Center at Harvard (P.K.S., S.S.) and by NIH NCI grants U2C-CA233280 and U2C-CA233262 (P.K.S., S.S.). Development of computational methods and image processing software is supported by a Team Science Grant from the Gray Foundation (P.K.S., S.S.), the Gates Foundation grant INV-027106 (P.K.S.), the David Liposarcoma Research Initiative at Dana-Farber Cancer Institute supported by KBF Canada via the Rossy Foundation Fund (P.K.S., S.S.), and the Emerson Collective (P.K.S.). S.S. is supported by the BWH President’s Scholars Award.
 
-**Instructions:** https://labsyspharm.github.io/cylinter/
+**Project Website:** https://labsyspharm.github.io/cylinter/
diff --git a/cylinter/components.py b/cylinter/components.py
@@ -75,6 +75,7 @@ def __init__(self,
                  sampleStatuses=None,
                  sampleReplicates=None,
                  samplesToExclude=None,
+                 counterstainChannel=None,
                  markersToExclude=None,
 
                  # selectROIs -
@@ -104,9 +105,7 @@ def __init__(self,
                  embeddingAlgorithmQC=None,
                  channelExclusionsClusteringQC=None,
                  samplesToRemoveClusteringQC=None,
-                 fracForEmbeddingQC=None,
-                 dimensionEmbeddingQC=None,
-                 topMarkersQC=None,
+                 percentDataPerChunk=None,
                  colormapAnnotationQC=None,
                  metricQC=None,
                  perplexityQC=None,
@@ -142,7 +141,6 @@ def __init__(self,
                  normalizeTissueCounts=None,
                  fracForEmbedding=None,
                  dimensionEmbedding=None,
-                 topMarkers=None,
                  colormapAnnotationClustering=None,
                  colormapAnnotation=None,
                  perplexity=None,
@@ -163,14 +161,10 @@ def __init__(self,
 
                  # curateThumbnails —
                  numThumbnails=None,
-                 topMarkersThumbnails=None,
                  windowSize=None,
                  segOutlines=None,
                  ):
 
-        assert topMarkers in ['channels', 'clusters'], \
-            'Invalid input for topMarkers configuration parameter.'
-
         self.inDir = inDir
         self.outDir = outDir
         self.startModule = startModule
@@ -180,6 +174,7 @@ def __init__(self,
         self.sampleStatuses = sampleStatuses
         self.sampleReplicates = sampleReplicates
         self.samplesToExclude = samplesToExclude
+        self.counterstainChannel = counterstainChannel
         self.markersToExclude = markersToExclude
 
         self.delintMode = delintMode
@@ -203,9 +198,7 @@ def __init__(self,
         self.embeddingAlgorithmQC = embeddingAlgorithmQC
         self.channelExclusionsClusteringQC = channelExclusionsClusteringQC
         self.samplesToRemoveClusteringQC = samplesToRemoveClusteringQC
-        self.fracForEmbeddingQC = fracForEmbeddingQC
-        self.dimensionEmbeddingQC = dimensionEmbeddingQC
-        self.topMarkersQC = topMarkersQC
+        self.percentDataPerChunk = percentDataPerChunk
         self.colormapAnnotationQC = colormapAnnotationQC
         self.metricQC = metricQC
         self.perplexityQC = perplexityQC
@@ -237,7 +230,6 @@ def __init__(self,
         self.normalizeTissueCounts = normalizeTissueCounts
         self.fracForEmbedding = fracForEmbedding
         self.dimensionEmbedding = dimensionEmbedding
-        self.topMarkers = topMarkers
         self.colormapAnnotationClustering = colormapAnnotationClustering
         self.perplexity = perplexity
         self.earlyExaggeration = earlyExaggeration
@@ -255,7 +247,6 @@ def __init__(self,
         self.FDRCorrection = FDRCorrection
 
         self.numThumbnails = numThumbnails
-        self.topMarkersThumbnails = topMarkersThumbnails
         self.windowSize = windowSize
         self.segOutlines = segOutlines
 

diff --git a/cylinter/config.py b/cylinter/config.py
@@ -44,8 +44,9 @@ def from_path(cls, path):
         config.inDir = pathlib.Path(data['inDir']).resolve()
         config.outDir = pathlib.Path(data['outDir']).resolve()
         config._parse_sample_metadata(data['sampleMetadata'])
-        config.samplesToExclude = (data['samplesToExclude'])
-        config.markersToExclude = (data['markersToExclude'])
+        config.samplesToExclude = list(data['samplesToExclude'])
+        config.counterstainChannel = str(data['counterstainChannel'])
+        config.markersToExclude = list(data['markersToExclude'])
 
         # CLASS MODULE CONFIGURATIONS
 
@@ -96,12 +97,9 @@ def from_path(cls, path):
             data['samplesToRemoveClustering']
         )
         config.normalizeTissueCounts = bool(data['normalizeTissueCounts'])
-        config.fracForEmbeddingQC = float(data['fracForEmbeddingQC'])
+        config.percentDataPerChunk = float(data['percentDataPerChunk'])
         config.fracForEmbedding = float(data['fracForEmbedding'])
-        config.dimensionEmbeddingQC = int(data['dimensionEmbeddingQC'])
         config.dimensionEmbedding = int(data['dimensionEmbedding'])
-        config.topMarkersQC = str(data['topMarkersQC'])
-        config.topMarkers = str(data['topMarkers'])
         config.colormapAnnotationQC = str(
             data['colormapAnnotationQC'])
         config.colormapAnnotationClustering = str(
@@ -136,7 +134,6 @@ def from_path(cls, path):
         config.FDRCorrection = bool(data['FDRCorrection'])
 
         config.numThumbnails = int(data['numThumbnails'])
-        config.topMarkersThumbnails = str(data['topMarkersThumbnails'])
         config.windowSize = int(data['windowSize'])
         config.segOutlines = bool(data['segOutlines'])
 

diff --git a/cylinter/config.yml → cylinter/cylinter_config.yml b/cylinter/config.yml → cylinter/cylinter_config.yml
@@ -2,8 +2,8 @@
 
 inDir: /Users/<username>/Desktop/cylinter_demo
 # Path to CyLinter input directory containing multi-channel
-# image files (TIFF or OME-TIFF), segmentation outlines (OME-TIFF),
-# segmentation masks (TIFF), and corresponding single-cell feature tables (CSV)
+# image files (TIF or OME-TIF), segmentation outlines (OME-TIF),
+# segmentation masks (TIF), and corresponding single-cell feature tables (CSV)
 
 outDir: /Users/<username>/Desktop/cylinter_demo/output
 # CyLinter output directory. Path is created if it does not exist.
@@ -26,6 +26,9 @@ samplesToExclude: []
 # (list of strs) Sample names to exclude from analysis specified
 # according to the first elements of sampleMetadata configuration.
 
+counterstainChannel: "DNA1"
+# (str) Name of the marker in the markers.csv file used to visualize the nuclear counterstain.
+
 markersToExclude: ["Rabbit IgG", "Goat IgG", "Mouse IgG", "CD56", "CD13",
                    "pAUR", "CCNE", "CDKN2A", "PCNA_1", "CDKN1B_2",
                    "CD63", "CD32", "CCNA2", "CDKN1C", "PCNA_1",

diff --git a/cylinter/modules/PCA.py b/cylinter/modules/PCA.py
@@ -29,8 +29,10 @@ def PCA(data, self, args):
     check, markers_filepath = input_check(self)
 
     # read marker metadata
-    markers, dna1, dna_moniker, abx_channels = read_markers(
-        markers_filepath=markers_filepath, markers_to_exclude=self.markersToExclude, data=data
+    markers, abx_channels = read_markers( 
+        markers_filepath=markers_filepath,
+        counterstain_channel=self.counterstainChannel,
+        markers_to_exclude=self.markersToExclude, data=None
     )
 
     # drop antibody channel exclusions for PCA
@@ -188,7 +190,7 @@ def PCA(data, self, args):
                    markersize=5.0, linewidth=5)
         )
         ax1.legend(handles=legend_handles, prop={'size': 10.0}, bbox_to_anchor=[0.95, 1.0])
-        fig1.savefig(os.path.join(pca_dir, 'variance.pdf'), bbox_inches='tight')
+        fig1.savefig(os.path.join(pca_dir, 'horns_analysis.pdf'), bbox_inches='tight')
         plt.close(fig1)
 
         ###################################################################
@@ -216,7 +218,7 @@ def PCA(data, self, args):
         ax2.tick_params(axis='both', which='major', labelsize=7.0)
 
         fig2.savefig(
-            os.path.join(pca_dir, 'pcaScoresPlotCells.png'), dpi=600, bbox_inches='tight'
+            os.path.join(pca_dir, 'pca_cells.png'), dpi=600, bbox_inches='tight'
         )
         plt.close(fig2)
 
@@ -492,13 +494,16 @@ def get_key(val):
 
             # save figure
             plt.savefig(
-                os.path.join(pca_dir, 'pcaScoresPlotSamples.pdf'),
+                os.path.join(pca_dir, 'pca_samples.pdf'),
                 bbox_inches='tight')
             plt.close('all')
 
             data = reorganize_dfcolumns(data, markers, self.dimensionEmbedding)
     else:
-        logging.info("n_components = 1, skipping PCA and Horn's parallel analysis.")
+        logging.info(
+            "n_components = 1. Only 1 sample (or 1 marker) in analysis. "
+            "Skipping PCA and Horn's parallel analysis."
+        )
 
     print()
     print()

diff --git a/cylinter/modules/aggregateData.py b/cylinter/modules/aggregateData.py
@@ -1,3 +1,6 @@
+import os
+import sys
+import yaml
 import logging
 
 import pandas as pd
@@ -13,10 +16,18 @@ def aggregateData(data, self, args):
 
     check, markers_filepath = input_check(self)
 
-    markers, dna1, dna_moniker, abx_channels = read_markers(
-        markers_filepath=markers_filepath, markers_to_exclude=self.markersToExclude, data=None
+    markers, abx_channels = read_markers( 
+        markers_filepath=markers_filepath,
+        counterstain_channel=self.counterstainChannel,
+        markers_to_exclude=self.markersToExclude, data=None
     )
 
+    # initialize CyLinter QC report if it hasn't been already
+    report_path = os.path.join(self.outDir, 'cylinter_report.yml')
+    if not os.path.exists(report_path):
+        f = open(report_path, 'w')
+        yaml.dump({}, f)
+
     df_list = []
     channel_setlist = []
     sample_keys = [i for i in self.sampleNames.keys()]
@@ -41,11 +52,10 @@ def aggregateData(data, self, args):
 
             # select boilerplate columns
             cols = (
-                ['CellID', 'X_centroid', 'Y_centroid', 'Area',
-                 'MajorAxisLength', 'MinorAxisLength',
-                 'Eccentricity', 'Solidity', 'Extent',
-                 'Orientation'] +
-                [i for i in markers['marker_name'] if i in csv.columns]
+                [i for i in [j for j in markers['marker_name']] +
+                 [i for i in ['CellID', 'X_centroid', 'Y_centroid', 'Area', 'MajorAxisLength',
+                              'MinorAxisLength', 'Eccentricity', 'Solidity', 'Extent', 
+                              'Orientation'] if i in csv.columns]]
             )
 
             # (for BAF project)
@@ -98,8 +108,16 @@ def aggregateData(data, self, args):
             #      'Orientation'] +
             #     [f'{i}_{mask_dict[i]}' for i
             #      in markers['marker_name']])
-
-            csv = csv[cols]
+
+            try:
+                csv = csv[cols]
+            except KeyError as e:
+                logger.info(
+                    'Aborting; some (or all) marker names in markers.csv do not appear '
+                    'as columns in the single-cell data table. Check for spelling and case.'
+                )
+                print(e)
+                sys.exit()
 
             # (for SARDANA)
             # trim mask object names from column headers
@@ -159,7 +177,7 @@ def aggregateData(data, self, args):
     data.reset_index(drop=True, inplace=True)
 
     # ensure MCMICRO-generated columns come first and
-    # are in the same order as csv input
+    # are in the same order as csv feature tables
     data = reorganize_dfcolumns(data, markers, self.dimensionEmbedding)
 
     print()