Update to README & allows pipeline to evaluate .Rtab gene presence ma…

…trices.
maxgmarin · Mar 26, 2024 · b2360f6 · b2360f6
1 parent db2dce6
commit b2360f6
Show file tree

Hide file tree

Showing 5 changed files with 71 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -69,6 +69,13 @@ panqc nrc -a InputAsmPaths.tsv -r pan_genome_reference.fa -m gene_presence_absen
 
 The above command will output an adjusted gene presence absence matrix along with additional statistics to the specified output directory (`NRC_results/`).
 
+
+Alternatively, if you would like to use a `gene_presence_absence.Rtab` file instead of a CSV matrix of gene presence, use add the `--is-rtab` flag.
+
+```
+panqc nrc -a InputAsmPaths.tsv -r pan_genome_reference.fa -m gene_presence_absence.Rtab --is-rtab -o NRC_results/
+```
+
 ### Analyzing included test data set
 
 If you wish to run an `panqc nrc` on an artifical (and abridged) test data set, you can run the following commands:
@@ -119,7 +126,7 @@ optional arguments:
                         Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)
 
   -m, --gene_matrix gene_presence_absence.csv
-                        Input pan-genome gene presence/absence matrix. Typically output as `gene_presence_absence.csv` by Panaroo/Roary (CSV)
+                        Input pan-genome gene presence/absence matrix. By default is assumed to be a `gene_presence_absence.csv` output by Panaroo/Roary (CSV) If the user provides the --is-rtab flag, the input is assumed to be an .Rtab (TSV)file.
 
   -o, --results_dir RESULTS_DIR
                         Output directory for analysis results.
@@ -138,6 +145,7 @@ optional arguments:
 
   -t, --min-ksim MIN_KSIM
                         Minimum k-mer similarity (maximum jaccard containment of k-mers between pair of sequences) to cluster sequences into the same "nucleotide similarity cluster" (Default: 0.8))
+  --is-rtab             Flag indicating that the input gene matrix is a tab-delimited .Rtab file
 ```
 
 
@@ -165,6 +173,7 @@ optional arguments:
 
 ```
 
+
 >🚧 Check back soon for full usage for each of the utility sub-pipelines of the panqc toolkit 🚧
 
 

diff --git a/panqc/__init__.py b/panqc/__init__.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ### Authors: Max Marin ([email protected])
-# Pan-genome QC toolkit (PQGC)
+# Pan-genome QC (panqc) toolkit 
 
 
 import sys
@@ -22,21 +22,23 @@ def _nrc_cli(args):
     ## 1) Set input parameters and PATHs ####
     input_Assemblies_TSV = args.asms
     input_PG_Ref_FA = args.pg_ref
-    input_PresAbs_CSV = args.gene_matrix
+    input_PresAbs_File = args.gene_matrix
     results_dir = args.results_dir
     min_query_cov = args.min_query_cov
     min_seq_id = args.min_seq_id
     kmer_size = args.kmer_size
     ksim_cluster_thresh = args.min_ksim
     prefix = args.prefix
+    rtab_input = args.is_rtab
 
     ## 2) Run the assembly sequence check function ####
 
-    Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_CSV,
-                                                        input_PG_Ref_FA,
-                                                        input_Assemblies_TSV,
-                                                        min_query_cov,
-                                                        min_seq_id)
+    Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_File,
+                                                         input_PG_Ref_FA,
+                                                         input_Assemblies_TSV,
+                                                         min_query_cov,
+                                                         min_seq_id,
+                                                         rtab_input)
 
     # 3) Print the general QC Stats 
     ASC_Stats_DF = get_AsmSeqCheck_QCStatsDF(Gene_PresAbs_WiAsmSeqCheck_DF)
@@ -124,11 +126,11 @@ def _asmseqcheck_cli(args):
     ## 1) Set input parameters and PATHs ####
 
     # Define input paths
-    input_PG_Ref_FA = args.in_pg_ref
+    input_PG_Ref_FA = args.pg_ref
 
-    input_AsmFA_TSV = args.in_assemblies
+    input_Assemblies_TSV = args.asms
 
-    input_PresAbs_CSV = args.in_gene_matrix
+    input_PresAbs_File = args.gene_matrix
 
     # Define output path
     output_PresAbs_WiDNASeqCheck = args.out_gene_matrix_wi_geneseqcheck
@@ -137,16 +139,18 @@ def _asmseqcheck_cli(args):
     min_query_cov = args.min_query_cov
     min_seq_id = args.min_seq_id
 
-    ## 2) Run the assembly sequence check function ####
+    # Set variable which defines whether the input gene matrix is an .Rtab file (or CSV file)
+    rtab_input = args.is_rtab
 
-    Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_CSV,
-                                                          input_PG_Ref_FA,
-                                                          input_AsmFA_TSV,
-                                                          min_query_cov,
-                                                          min_seq_id)
+    ## 2) Run the assembly sequence check function ####
+    Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_File,
+                                                         input_PG_Ref_FA,
+                                                         input_Assemblies_TSV,
+                                                         min_query_cov,
+                                                         min_seq_id,
+                                                         rtab_input)
 
     # 3) Print the general QC Stats
-
     _ = get_AsmSeqCheck_QCStats(Gene_PresAbs_WiAsmSeqCheck_DF)
 
 
@@ -195,7 +199,6 @@ def _nscluster_cli(args):
 
 
 
-# fg("blue") 
 def main():
     parser = argparse.ArgumentParser(description = "Toolkit for focused on augmenting common CDS based pan-genome analysis with nucleotide sequence comparison.")
     sub_parser_1 = parser.add_subparsers(required=True, help='Please select one of the pipelines of the PanQC toolkit.')
@@ -209,7 +212,7 @@ def main():
                                                     help="Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)")
 
     nrc_parser.add_argument('-m', '--gene_matrix', type=str, required=True, metavar="gene_presence_absence.csv",
-                                                    help="Input pan-genome gene presence/absence matrix. Typically output as `gene_presence_absence.csv` by Panaroo/Roary (CSV)")
+                                                    help="Input pan-genome gene presence/absence matrix. By default is assumed to be a `gene_presence_absence.csv` output by Panaroo/Roary (CSV) \n If the user provides the --is-rtab flag, the input is assumed to be an .Rtab (TSV) file.")
 
     nrc_parser.add_argument('-o', '--results_dir', type=str, required=True,
                                                     help="Output directory for analysis results.")
@@ -228,7 +231,10 @@ def main():
 
     nrc_parser.add_argument('-t', '--min-ksim',type=float, default=0.8,
                             help='Minimum k-mer similarity (maximum jaccard containment of k-mers between pair of sequences) to cluster sequences into the same "nucleotide similarity cluster" (Default: 0.8))')
-
+
+    nrc_parser.add_argument('--is-rtab', action='store_true',
+                                                    help="Flag indicating that the input gene matrix is a tab-delimited .Rtab file")
+
     nrc_parser.set_defaults(func=_nrc_cli)
 
 
@@ -240,14 +246,14 @@ def main():
     utils_sub_parser = utils_parser.add_subparsers(required=True, help='Please select one of the utilility pipelines of the PanQC toolkit.')
 
     asmseqcheck_parser = utils_sub_parser.add_parser("asmseqcheck", help="")
-    asmseqcheck_parser.add_argument('-a', '--in_assemblies', type=str, required=True,
-                                    help="Paths to input assemblies. (TSV)")
+    asmseqcheck_parser.add_argument('-a', '--asms', type=str, required=True, metavar="PathToAsms.tsv",
+                                                    help="Table with SampleID & Paths to each input assemblies. (TSV)")
 
-    asmseqcheck_parser.add_argument('-r', '--in_pg_ref', type=str, required=True,
-                                    help="Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)")
+    asmseqcheck_parser.add_argument('-r', '--pg-ref', type=str, required=True,  metavar="pan_genome_reference.fasta",
+                                                    help="Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)")
 
-    asmseqcheck_parser.add_argument('-m', '--in_gene_matrix', type=str, required=True,
-                                    help="Input pan-genome gene presence/absence matrix. Typically output as `gene_presence_absence.csv` by Panaroo/Roary (CSV)")
+    asmseqcheck_parser.add_argument('-m', '--gene_matrix', type=str, required=True, metavar="gene_presence_absence.csv",
+                                                    help="Input pan-genome gene presence/absence matrix. By default is assumed to be a `gene_presence_absence.csv` output by Panaroo/Roary (CSV) \n If the user provides the --is-rtab flag, the input is assumed to be an .Rtab (TSV) file.")
 
     asmseqcheck_parser.add_argument('-o', '--out_gene_matrix_wi_geneseqcheck',type=str, required=True,
                                     help="Output pan-genome gene presence/absence matrix with updated gene presence/absence calls. (CSV). \n NOTE: 2 reflects that similar gene sequence is present at the nucleotide level (CSV)")
@@ -258,6 +264,10 @@ def main():
     asmseqcheck_parser.add_argument('-i', '--min_seq_id', type=float, default=0.9,
                             help="Minimum sequence identity to classify a gene as present within an assembly (0-1)")
 
+    asmseqcheck_parser.add_argument('--is-rtab', action='store_true',
+                                                    help="Flag indicating that the input gene matrix is a tab-delimited .Rtab file")
+
+
     asmseqcheck_parser.set_defaults(func=_asmseqcheck_cli)
 
 

diff --git a/panqc/asm_gene_search.py b/panqc/asm_gene_search.py
@@ -10,7 +10,7 @@
 # Set the logging level to INFO
 #logging.basicConfig(level=logging.INFO)
 
-from .utils import parse_PresAbs_CSV_General, get_columns_excluding, parse_PG_Ref_FA
+from .utils import parse_PresAbs_CSV_General, parse_PresAbs_Rtab, get_columns_excluding, parse_PG_Ref_FA
 
 
 #### Define function for parsing Mappy alignment hits
@@ -239,59 +239,58 @@ def create_LRandSR_AsmFA_PATH_Dict(i_AsmFA_LRandSR_TSV):
                                 'Order within Fragment', 'Accessory Fragment', 'Accessory Order with Fragment', 'QC',
                                 'Min group size nuc', 'Max group size nuc', 'Avg group size nuc']
 
-def asmseqcheck_frompaths(i_Gene_PresAbs_CSV_PATH,
+def asmseqcheck_frompaths(i_Gene_PresAbs_File_PATH,
                             i_PG_Ref_FA_PATH,
                             i_AsmFA_TSV,
-                            MinQueryCov, MinQuerySeqID):
+                            MinQueryCov,
+                            MinQuerySeqID,
+                            rtab_matrix = False):
     """
     Searches for gene sequences in genome assemblies and returns a DataFrame with presence/absence information.
 
     Args:
-        i_Gene_PresAbs_CSV_PATH (str): Path to the gene presence/absence matrix CSV file.
-        i_PG_Ref_FA_PATH (str): Path to the pan-genome (nucleotide) reference fasta file.
+        i_Gene_PresAbs_File_PATH (str): Path to the gene presence/absence matrix file.
+        i_PG_Ref_FA_PATH (str): Path to the pan-genome reference fasta file.
         i_AsmFA_TSV (str): Path to the TSV file containing the genome assembly fasta file paths.
         MinQueryCov (float): Minimum query coverage required for a gene sequence to be considered present.
         MinQuerySeqID (float): Minimum query sequence identity required for a gene sequence to be considered present.
+        rtab_matrix (bool): Flag indicating whether the gene presence/absence matrix is in .Rtab format.
 
     Returns:
         pandas.DataFrame: A DataFrame with presence/absence information for each gene in each sample.
     """
 
-    # 1) Parse in Gene Pres/Abs matrix and select sampleIDs
-
+    # Parse the gene presence/absence matrix and select sampleIDs
     print("Parsing input gene presence/absence matrix...")
-    Gene_PresAbs_DF = parse_PresAbs_CSV_General(i_Gene_PresAbs_CSV_PATH)
-
-    i_SampleIDs = get_columns_excluding(Gene_PresAbs_DF, PresAbs_NonSampleID_ColNames)
+    if rtab_matrix:
+        print("   Gene presence/absence matrix being parsed as TSV (.Rtab)")
+        Gene_PresAbs_DF = parse_PresAbs_Rtab(i_Gene_PresAbs_File_PATH)
+        print("   Dimensions of parsed matrix", Gene_PresAbs_DF.shape)
+    else:
+        print("   Gene presence/absence matrix being parsed as CSV")
+        Gene_PresAbs_DF = parse_PresAbs_CSV_General(i_Gene_PresAbs_File_PATH)
+        print("   Dimensions of parsed matrix", Gene_PresAbs_DF.shape)
 
+    i_SampleIDs = get_columns_excluding(Gene_PresAbs_DF, PresAbs_NonSampleID_ColNames)
     print("Finished parsing gene presence/absence matrix\n")
 
-    # 2) Read in pan-genome (nucleotide) reference fasta
+    # Read in the pan-genome reference fasta
     print("Parsing input gene reference fasta")
     PG_Ref_NucSeqs = parse_PG_Ref_FA(i_PG_Ref_FA_PATH)
     print("Finished parsing gene reference fasta\n")
 
-    # 3) Read in genome assembly fasta PATH TSV, convert to dict
+    # Read in the genome assembly fasta PATH TSV and convert to dict
     print("Parsing TSV of input assembly PATHs")
     AsmFA_Dict = create_AsmFA_PATH_Dict(i_AsmFA_TSV)
     print("Finished parsing input assembly PATHs \n")
-
-    # 4) Run alignments to check for gene sequences in assemblies
-    print("Running alignments to check for gene sequences in assemblies...")
-
-    # Gene_PresAbs_WiAsmSeqCheck_DF = PresAbsQC_CheckAsmForGeneSeq(Gene_PresAbs_DF, PG_Ref_NucSeqs,
-    #                                                             AsmFA_Dict, i_SampleIDs,
-    #                                                             MinQueryCov, MinQuerySeqID)
 
+    # Run alignments to check for gene sequences in assemblies
+    print("Running alignments to check for gene sequences in assemblies...")
     Gene_PresAbs_WiAsmSeqCheck_DF = PresAbsQC_CheckAsmForGeneSeq_V2(Gene_PresAbs_DF, PG_Ref_NucSeqs,
                                                                  AsmFA_Dict, i_SampleIDs,
                                                                  MinQueryCov, MinQuerySeqID)
-
-
-
     print("Finished searching for gene sequences in assemblies \n")
 
-
     return Gene_PresAbs_WiAsmSeqCheck_DF
 
 

diff --git a/panqc/utils.py b/panqc/utils.py
@@ -7,7 +7,8 @@
 
 def parse_PresAbs_Rtab(PresAbs_Rtab_PATH):
     '''
-    This function parsesthe `gene_presence_absence.csv` file output by Panaroo '''
+    This function parses the `gene_presence_absence.Rtab` file output by many pan-genome analysis softwares (Panaroo, Roary)
+    '''
 
     i_Gene_PresAbs_DF = pd.read_csv(PresAbs_Rtab_PATH, sep = "\t")
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
 name = "panqc"
 description = "Package & software for analysis of nucleotide redundancy within CDS-based pan-genome analyses"
 readme = "README.md"
-version = "0.0.3"
+version = "0.0.4"
 authors = [
     { name = "Maximillian Marin", email = "[email protected]" }
 ]