Add rename flag to VDJ and ATAC pipelines, fix VDJ mm2024 genome entry

atac.smk, gex.smk, vdj.smk, Snakefile - Added the rename ability to VDJ and ATAC and made the pipelines consistent in parameter names. genome.json - The mm2024 vdj entry was previously accidentally pointing to the human reference; fixed to point to the mouse reference. cell-seek and run.py - Added a test to check the rename file format and to check whether the input FASTQ files are included in the rename / libraries files; also throws an error if it detects that the rename / libraries file is trying to call a FASTQ file that is not in the input. run.md - Documentation added to include the rename flag for GEX, VDJ, and ATAC.
OpenOmics · Apr 18, 2024 · 7cf901b · 7cf901b
1 parent b4c067b
commit 7cf901b
Show file tree

Hide file tree

Showing 8 changed files with 321 additions and 41 deletions.
diff --git a/cell-seek b/cell-seek
@@ -39,7 +39,7 @@ import argparse  # potential python3 3rd party package, added in python/3.5
 
 # Local imports
 from src import version
-from src.run import init, setup, bind, dryrun, runner
+from src.run import init, setup, bind, dryrun, runner, finalcheck
 from src.shells import bash
 from src.utils import (
     Colors,
@@ -139,8 +139,16 @@ def run(sub_args):
         # Dryrun pipeline
         dryrun_output = dryrun(outdir = sub_args.output) # python3 returns byte-string representation
         print("\nDry-running {} pipeline:\n{}".format(_name, dryrun_output.decode("utf-8")))
+        for arg_check in ['rename', 'libraries']:
+            if config['options'][arg_check] != 'None':
+                finalcheck(config=config, flag=arg_check)
         sys.exit(0)
 
+    # Step 4b. Perform final check of input files
+    for arg_check in ['rename', 'libraries']:
+        if config['options'][arg_check] != 'None':
+            finalcheck(config=config, flag=arg_check)
+
     # Step 5. Orchestrate pipeline execution,
     # run pipeline in locally on a compute node
     # for debugging purposes or submit the master
@@ -542,14 +550,21 @@ def parsed_arguments(name, description):
           --rename RENAME
                                 Rename sample file. A CSV file containing the name of the FASTQ
                                 file and the new name of the sample. Only the samples listed in
-                                the CSV files will be ran. This flag is currently only applicable
-                                when dealing with GEX projects.
+                                the CSV files will be run. This flag is only applicable when
+                                dealing with GEX, VDJ, or ATAC projects. Renaming samples in the
+                                other pipelines can be achieved via the libraries file.
                                   Here is an example rename.csv file:
-                                    FASTQ,Sample
+                                    FASTQ,Name
                                     original_name1,new_name1
                                     original_name2,new_name1
                                     original_name3,new_name2
                                     original_name4,new_name3
+                                In this example, new_name1 has FASTQ files with two different
+                                names. With this input, both sets of FASTQ files will be used
+                                when processing the sample as new_name1. To keep the original
+                                name of a sample, list the same name in both columns. Any FASTQ
+                                file that does not have the name original_name1, original_name2,
+                                original_name3, or original_name4 will not be run.
                                   Example: --rename rename.csv
 
         {3}{4}Orchestration options:{5}

diff --git a/config/genome.json b/config/genome.json
@@ -22,7 +22,7 @@
         "mm2024": {
             "gex_transcriptome": "/data/OpenOmics/references/cell-seek/mouse/refdata-gex-GRCm39-2024-A",
             "cite_transcriptome": "/data/OpenOmics/references/cell-seek/mouse/refdata-gex-GRCm39-2024-A",
-	    "vdj_ref": "/data/NCBR/references/cellranger_references/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0/"
+	    "vdj_ref": "/data/NCBR/references/cellranger_references/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0/"
         }
     }
 }
diff --git a/docs/usage/run.md b/docs/usage/run.md
@@ -17,6 +17,7 @@ $ cell-seek run [--help] \
       [--aggregate {{mapped, none}}] [--exclude-introns] \
       [--library LIBRARIES] [--features FEATURES] \
       [--filter FILTER] [--metadata METADATA] [--create-bam] \
+      [--rename RENAME] \
       --input INPUT [INPUT ...] \
       --output OUTPUT \
       --pipeline {gex, ...} \
@@ -161,6 +162,31 @@ Each of the following arguments are optional, and do not need to be provided.
 >
 > ***Example:*** `--metadata metadata.csv`
 
+---
+  `--rename RENAME`
+> **Rename sample file.**
+> *type: file*
+>
+> Rename sample file. A CSV file containing the name of the FASTQ file and the new name of the sample. Only the samples listed in the CSV file will be run.
+>
+> *Here is an example rename.csv file:*
+> ```
+> FASTQ,Name
+> original_name1,new_name1
+> original_name2,new_name2
+> original_name3,new_name3
+> original_name3-2,new_name3
+> original_name4,original_name4
+> ```
+>
+> *Where:*
+>
+> - *FASTQ:* The name that is used in the FASTQ file
+> - *Name:* The sample ID that is used as the sample name for Cell Ranger count. Rows that share the same Name are processed together as one sample.
+> 
+> In this example, new_name3 has FASTQ files with two different names. With this input, both sets of FASTQ files will be used when processing the sample as new_name3. original_name4 will not be renamed. Any FASTQ file that does not have the name original_name1, original_name2, original_name3, or original_name4 will not be run.
+>
+> ***Example:*** `--rename rename.csv`
 
 ### 2.2 VDJ
 
@@ -199,7 +225,7 @@ Each of the following arguments are required. Failure to provide a required argu
 > **Reference genome.**   
 > *type: string*
 >   
-> This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Since there is no 2024 release VDJ reference, if hg2024 or mm2024 is selected the VDJ reference CR 7.1 release will be used.
+> This option defines the reference genome of the samples. cell-seek comes bundled with prebuilt reference files for human and mouse samples, e.g. hg38 or mm10. Since there is no 2024 release of the VDJ reference, if hg2024 or mm2024 is selected, the CR 7.1 release of the VDJ reference will be used for human and the CR 7.0 release will be used for mouse.
 >
 > A custom reference genome can also be provided via a json file. Additional information for creating this json file can be found in [<code>cell-seek <b>genome</b></code>](../genome).
 >
@@ -218,7 +244,30 @@ Each of the following arguments are required. Failure to provide a required argu
 
 #### 2.2.2 Analysis Options
 
-The VDJ pipeline currently does not have any applicable analysis flags.
+  `--rename RENAME`
+> **Rename sample file.**
+> *type: file*
+>
+> Rename sample file. A CSV file containing the name of the FASTQ file and the new name of the sample. Only the samples listed in the CSV file will be run.
+>
+> *Here is an example rename.csv file:*
+> ```
+> FASTQ,Name
+> original_name1,new_name1
+> original_name2,new_name2
+> original_name3,new_name3
+> original_name3-2,new_name3
+> original_name4,original_name4
+> ```
+>
+> *Where:*
+>
+> - *FASTQ:* The name that is used in the FASTQ file
+> - *Name:* The sample ID that is used as the sample name for Cell Ranger vdj. Rows that share the same Name are processed together as one sample.
+>
+> In this example, new_name3 has FASTQ files with two different names. With this input, both sets of FASTQ files will be used when processing the sample as new_name3. original_name4 will not be renamed. Any FASTQ file that does not have the name original_name1, original_name2, original_name3, or original_name4 will not be run.
+>
+> ***Example:*** `--rename rename.csv`
 
 ### 2.3 CITE
 
@@ -393,7 +442,7 @@ Each of the following arguments are required. Failure to provide a required argu
 > **Reference genome.**   
 > *type: string*
 >   
-> This option defines the reference genome of the samples. cell-seek does comes bundled with prebuilt reference files for human and mouse samples, The options hg38 or mm10 would select the 2020 release of the reference. The options hg2024 or mm2024 would select the 2024 release of the reference. More information about the officially released references can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/latest/release-notes/cr-reference-release-notes). Since there is no 2024 released VDJ reference, if hg2024 or mm2024 is selected in a run that includes VDJ data, the VDJ reference CR 7.1 release will be used.
+> This option defines the reference genome of the samples. cell-seek comes bundled with prebuilt reference files for human and mouse samples. The options hg38 or mm10 would select the 2020 release of the reference. The options hg2024 or mm2024 would select the 2024 release of the reference. More information about the officially released references can be found on the [10x Genomics website](https://www.10xgenomics.com/support/software/cell-ranger/latest/release-notes/cr-reference-release-notes). Since there is no 2024 release of the VDJ reference, if hg2024 or mm2024 is selected in a run that includes VDJ data, the CR 7.1 release of the VDJ reference will be used for human and the CR 7.0 release will be used for mouse.
 >
 > A custom reference genome can also be provided via a json file. Additional information for creating this json file can be found in [<code>cell-seek <b>genome</b></code>](../genome).
 >
@@ -586,7 +635,29 @@ Each of the following arguments are required. Failure to provide a required argu
 
 #### 2.5.2 Analysis Options
 
-The ATAC pipeline currently does not have any applicable analysis flags.
+  `--rename RENAME`
+> **Rename sample file.**
+> *type: file*
+>
+> Rename sample file. A CSV file containing the name of the FASTQ file and the new name of the sample. Only the samples listed in the CSV file will be run.
+>
+> *Here is an example rename.csv file:*
+> ```
+> FASTQ,Name
+> original_name1,new_name1
+> original_name2,new_name2
+> original_name3,new_name3
+> original_name3-2,new_name3
+> original_name4,original_name4
+> ```
+>
+> *Where:*
+>
+> - *FASTQ:* The name that is used in the FASTQ file
+> - *Name:* The sample ID that is used as the sample name for Cell Ranger ATAC count. Rows that share the same Name are processed together as one sample.
+>
+> In this example, new_name3 has FASTQ files with two different names. With this input, both sets of FASTQ files will be used when processing the sample as new_name3. original_name4 will not be renamed. Any FASTQ file that does not have the name original_name1, original_name2, original_name3, or original_name4 will not be run.
+>
+> ***Example:*** `--rename rename.csv`
 
 ### 2.6 Multiome
 

diff --git a/src/run.py b/src/run.py
@@ -510,6 +510,7 @@ def _require(fields, d, lib):
     # makes it so the order of the
     # columns does not matter
     indices = {}
+
     with open(libraries_file) as fh:
         try:
             header = next(fh).strip().split(delimeter)
@@ -537,8 +538,6 @@ def _require(fields, d, lib):
     return config
 
 
-
-
 def check_reference_file(reference_file, flag, delimeter = ','):
     """Check reference information from the features
     or cmo reference file. The reference file is a CSV
@@ -600,6 +599,156 @@ def _require(fields, d, lib, flag):
         _require(['id', 'name', 'read', 'pattern', 'sequence', 'feature_type'], indices, reference_file, flag)
 
 
+
+def check_rename_file(config, rename_file, delimeter = ','):
+    """Check sample information from the rename file.
+    The rename file is a CSV file containing information
+    about each sample. It contains each sample's name
+    and its associated demultiplexed (FastQ) name. The
+    relationship between samples provided to the --input
+    option and samples listed in the rename file is 1:many.
+    This is because sets of FastQ files with different
+    names that are from the same sample. It contains each
+    unique set of FASTQ name and sample name.
+    @params config <dict>:
+        Config dictionary containing metadata to run pipeline
+    @params rename_file <string>:
+        rename file containing information about each sample
+    @params flag <string>:
+        Config flag that was used to provide the rename file
+
+    """
+    def _require(fields, d, lib):
+        """Private function that checks to see if all required fields
+        are provided in the reference file. If nan item in fields does
+        not  exist in d, then the user forget to add this required field.
+        """
+        missing = []
+        for f in fields:
+            try:
+                i = d[f]
+            except KeyError:
+                missing.append(f)
+                pass
+        if missing:
+            fatal(
+                f"Error: Missing required fields in --rename {{}} file!\n \
+                └── Please add information for the following field(s): {{}}".format(
+                    lib,
+                    ','.join([f.lower() for f in missing])
+                )
+            )
+
+        return
+
+    # Get file extension to determine
+    # the appropriate file delimeter
+    extension = os.path.splitext(rename_file)[-1].lower()
+    if extension in ['.tsv', '.txt', '.text', '.tab']:
+        # file is tab seperated
+        delimeter = '\t'
+    # Find index of file dynamically,
+    # makes it so the order of the
+    # columns does not matter
+    indices = {}
+    with open(rename_file) as fh:
+        try:
+            header = next(fh).strip().split(delimeter)
+        except StopIteration:
+            fatal(
+                f'Error: --rename {{}} cannot be empty!\n \
+            └── Please ensure the file is not empty before proceeding again.'.format(reference_file)
+            )
+        for i in range(len(header)):
+            colname = header[i].strip().lower()
+            indices[colname] = i
+        _require(['fastq', 'name'], indices, rename_file)
+
+
+
+def finalcheck(config, flag, delimeter=','):
+    """Check the contents of the rename or libraries
+    file against input. This function checks to see if
+    the input files are not used in the rename/libraries
+    file and prints a warning if that occurs. If either
+    file lists a FASTQ file or path that is not included
+    in the input and throws an error if that is detected.
+    @params config <dict>:
+        Config dictionary containing metadata to run pipeline
+    @params flag <string>:
+        Config flag that was used to provide the input file
+    """
+    filename = config['options'][flag]
+
+    extension = os.path.splitext(filename)[-1].lower()
+    if extension in ['.tsv', '.txt', '.text', '.tab']:
+        # file is tab seperated
+        delimeter = '\t'
+
+    # Find index of file dynamically,
+    # makes it so the order of the
+    # columns does not matter
+    indices = {}
+
+    # Dictionary holding unique contents from files to use for comparisons
+    contents = {}
+    with open(filename) as fh:
+        try:
+            header = next(fh).strip().split(delimeter)
+        except StopIteration:
+            fatal(
+                f'Error: --rename {{}} cannot be empty!\n \
+            └── Please ensure the file is not empty before proceeding again.'.format(reference_file)
+            )
+        for i in range(len(header)):
+            colname = header[i].strip().lower()
+            indices[colname] = i
+        for line in fh:
+            linelist = line.strip().split(delimeter)
+            for i in indices:
+                values = contents.get(i, set())
+                values.add(linelist[indices[i]])
+                contents[i] = values
+
+    # Compiles the sample names and fastq paths from the input (config)
+    samples  = set([re.sub("_S[0-9]+_L00[0-9]", "", i) for i in config['samples']])
+    fastq_paths = set([os.path.dirname(i) for i in config['options']['input']])
+
+    for index_name in indices:
+        comparison = contents[index_name]
+
+        #Check the FASTQ names against the sample (fastq) names provided in the input files
+        if index_name in ['sample', 'fastq']:
+            if samples != comparison:
+                if len(samples-comparison) > 0:
+                    print(f"\nWarning: Some FASTQs will be skipped! \nWarning: --{{}} {{}} does not contain values for all provided FASTQ files.\n \
+            └── Please note that no sample names have been provided for FASTQ files with the following id(s): {{}} \n \
+            These FASTQ files will be skipped when running the pipeline.".format(flag, filename, ','.join(samples-comparison)))
+                if len(comparison-samples) > 0:
+                    fatal(
+                        f'\nError: --{{}} {{}} contains values in FASTQ column that is not in the provided FASTQ files!\n \
+            └── Please note that the followed listed FASTQ names are not found in the input files: {{}} '.format(flag, filename, ','.join(comparison-samples))
+                    )
+
+        if index_name == 'flowcell':
+            # Check to see which values in file are not found in fastq_paths
+            missing_file = set([i for i in comparison if sum([i in fastq_path for fastq_path in fastq_paths]) == 0])
+
+            # Check to see which fastq_paths from input are not found in the flowcell values in file
+            missing_path = set([fastq_path for fastq_path in fastq_paths if sum([i in fastq_path for i in comparison]) == 0])
+
+            if len(missing_path) > 0:
+                print(f"\nWarning: Some FASTQs will be skipped! \nWarning: --{{}} {{}} does not contain values for all provided FASTQ paths.\n \
+            └── Please note that no samples contain flowcells that are on the following path(s): \n \
+            {{}} \n \
+            Any FASTQ files in these paths will be skipped when running the pipeline.".format(flag, filename, ','.join(missing_path)))
+            if len(missing_file) > 0:
+                fatal(
+                    f'\nError: --{{}} {{}} contains values in FASTQ column that is not in the provided FASTQ files!\n \
+            └── Please note that the followed listed FASTQ names are not found in the input files: {{}} '.format(flag, filename, ','.join(missing_file))
+                )
+
+
 def check_conditional_parameters(config):
     """Check the compiled config fictionary to ensure
     that any parameters that are only required for
@@ -710,6 +859,10 @@ def add_rawdata_information(sub_args, config, ifiles):
         reference = sub_args.cmo_reference
         check_reference_file(reference_file = reference, flag = "cmo_reference")
 
+    if sub_args.rename != None:
+        rename = sub_args.rename
+        check_rename_file(rename_file = rename, config=config)
+
     return config
 
 

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -28,7 +28,7 @@ features = config['options']['features']       # Features files for cellranger (
 libraries = config['options']['libraries']     # Libraries files for cellranger (not used in all pipelines)
 cmo_ref = config['options']['cmo_reference']     # CMO Reference files for cellranger (only used in multi pipeline)
 cmo_sample = config['options']['cmo_sample']     # CMO Sample files for cellranger (only used in multi pipelines)
-rename = config['options']['rename']            # File containing how to rename samples when running Cell Ranger analysis
+RENAME = config['options']['rename']            # File containing how to rename samples when running Cell Ranger analysis
 exclude_introns = str_bool(                           # Use introns for pre mRNA,
     config['options']['exclude_introns']              # default: False
 )
@@ -44,16 +44,17 @@ if 'libraries' in config:
 pipeline_output = []
 
 
-rename_dict = dict()
-if rename != 'None':
-    with open(rename) as f:
+RENAME_DICT = dict() #Dictionary containing information on renamed samples where the keys are the FASTQ file names and the values are the Cell Ranger sample names
+if RENAME != 'None':
+    with open(RENAME) as f:
         tabs = [i.lower() for i in next(f).strip().split(',')]
         index_fastq = [i for i in range(len(tabs)) if 'fastq' in tabs[i]][0]
-        index_sample = [i for i in range(len(tabs)) if 'sample' in tabs[i]][0]
+        index_sample = [i for i in range(len(tabs)) if 'name' in tabs[i]][0]
         for line in f:
             line = line.strip().split(',')
-            rename_dict[line[index_fastq]] = line[index_sample]
-    samples = list((set(samples) - set(rename_dict.keys())).union(rename_dict.values()))
+            RENAME_DICT[line[index_fastq]] = line[index_sample]
+    samples = list(set(RENAME_DICT.values()))
+#    samples = list((set(samples) - set(RENAME_DICT.keys())).union(RENAME_DICT.values()))
     samples.sort()
 
 # Import rules